]> git.ipfire.org Git - thirdparty/gcc.git/blob - libgcc/config/arc/ieee-754/divdf3.S
config.host (arc*-*-elf*, [...]): New configurations.
[thirdparty/gcc.git] / libgcc / config / arc / ieee-754 / divdf3.S
1 /* Copyright (C) 2008-2012 Free Software Foundation, Inc.
2 Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
3 on behalf of Synopsys Inc.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
16
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
25
26 /*
27 to calculate a := b/x as b*y, with y := 1/x:
28 - x is in the range [1..2)
29 - calculate 15..18 bit inverse y0 using a table of approximating polynoms.
30 Precision is higher for polynoms used to evaluate input with larger
31 value.
32 - Do one newton-raphson iteration step to double the precision,
33 then multiply this with the divisor
34 -> more time to decide if dividend is subnormal
35 - the worst error propagation is on the side of the value range
36 with the least initial defect, thus giving us about 30 bits precision.
37 The truncation error for the either is less than 1 + x/2 ulp.
38 A 31 bit inverse can be simply calculated by using x with implicit 1
39 and chaining the multiplies. For a 32 bit inverse, we multiply y0^2
40 with the bare fraction part of x, then add in y0^2 for the implicit
41 1 of x.
42 - If calculating a 31 bit inverse, the systematic error is less than
43 -1 ulp; likewise, for 32 bit, it is less than -2 ulp.
44 - If we calculate our seed with a 32 bit fraction, we can archive a
45 tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we
46 only need to take the step to calculate the 2nd stage rest and
47 rounding adjust 1/32th of the time. However, if we use a 20 bit
48 fraction for the seed, the negative error can exceed -2 ulp/128, (2)
49 thus for a simple add / tst check, we need to do the 2nd stage
50 rest calculation/ rounding adjust 1/16th of the time.
51 (1): The inexactness of the 32 bit inverse contributes an error in the
52 range of (-1 .. +(1+x/2) ) ulp/128. Leaving out the low word of the
53 rest contributes an error < +1/x ulp/128 . In the interval [1,2),
54 x/2 + 1/x <= 1.5 .
55 (2): Unless proven otherwise. I have not actually looked for an
56 example where -2 ulp/128 is exceeded, and my calculations indicate
57 that the excess, if existent, is less than -1/512 ulp.
58 */
59 #include "arc-ieee-754.h"
60
61 /* N.B. fp-bit.c does double rounding on denormal numbers. */
62 #if 0 /* DEBUG */
63 .global __divdf3
64 FUNC(__divdf3)
65 .balign 4
66 __divdf3:
67 push_s blink
68 push_s r2
69 push_s r3
70 push_s r0
71 bl.d __divdf3_c
72 push_s r1
73 ld_s r2,[sp,12]
74 ld_s r3,[sp,8]
75 st_s r0,[sp,12]
76 st_s r1,[sp,8]
77 pop_s r1
78 bl.d __divdf3_asm
79 pop_s r0
80 pop_s r3
81 pop_s r2
82 pop_s blink
83 cmp r0,r2
84 cmp.eq r1,r3
85 jeq_s [blink]
86 and r12,DBL0H,DBL1H
87 bic.f 0,0x7ff80000,r12 ; both NaN -> OK
88 jeq_s [blink]
89 bl abort
90 ENDFUNC(__divdf3)
91 #define __divdf3 __divdf3_asm
92 #endif /* DEBUG */
93
94 FUNC(__divdf3)
95 __divdf3_support: /* This label makes debugger output saner. */
96 .balign 4
97 .Ldenorm_dbl1:
98 brge r6, \
99 0x43500000,.Linf_NaN ; large number / denorm -> Inf
100 bmsk.f r12,DBL1H,19
101 mov.eq r12,DBL1L
102 mov.eq DBL1L,0
103 sub.eq r7,r7,32
104 norm.f r11,r12 ; flag for x/0 -> Inf check
105 beq_s .Linf_NaN
106 mov.mi r11,0
107 add.pl r11,r11,1
108 add_s r12,r12,r12
109 asl r8,r12,r11
110 rsub r12,r11,31
111 lsr r12,DBL1L,r12
112 tst_s DBL1H,DBL1H
113 or r8,r8,r12
114 lsr r4,r8,26
115 lsr DBL1H,r8,12
116 ld.as r4,[r10,r4]
117 bxor.mi DBL1H,DBL1H,31
118 sub r11,r11,11
119 asl DBL1L,DBL1L,r11
120 sub r11,r11,1
121 mpyhu r5,r4,r8
122 sub r7,r7,r11
123 asl r4,r4,12
124 b.d .Lpast_denorm_dbl1
125 asl r7,r7,20
126 ; wb stall
127
128 .balign 4
129 .Ldenorm_dbl0:
130 bmsk.f r12,DBL0H,19
131 ; wb stall
132 mov.eq r12,DBL0L
133 sub.eq r6,r6,32
134 norm.f r11,r12 ; flag for 0/x -> 0 check
135 brge r7, \
136 0x43500000, .Lret0_NaN ; denorm/large number -> 0
137 beq_s .Lret0_NaN
138 mov.mi r11,0
139 add.pl r11,r11,1
140 asl r12,r12,r11
141 sub r6,r6,r11
142 add.f 0,r6,31
143 lsr r10,DBL0L,r6
144 mov.mi r10,0
145 add r6,r6,11+32
146 neg.f r11,r6
147 asl DBL0L,DBL0L,r11
148 mov.pl DBL0L,0
149 sub r6,r6,32-1
150 b.d .Lpast_denorm_dbl0
151 asl r6,r6,20
152
153 .Linf_NaN:
154 tst_s DBL0L,DBL0L ; 0/0 -> NaN
155 xor_s DBL1H,DBL1H,DBL0H
156 bclr.eq.f DBL0H,DBL0H,31
157 bmsk DBL0H,DBL1H,30
158 xor_s DBL0H,DBL0H,DBL1H
159 sub.eq DBL0H,DBL0H,1
160 mov_s DBL0L,0
161 j_s.d [blink]
162 or DBL0H,DBL0H,r9
163 .balign 4
164 .Lret0_NaN:
165 xor_s DBL1H,DBL1H,DBL0H
166 cmp_s r12,r9
167 mov_s DBL0L,0
168 bmsk DBL0H,DBL1H,30
169 xor_s DBL0H,DBL0H,DBL1H
170 j_s.d [blink]
171 sub.hi DBL0H,DBL0H,1
172 .Linf_nan_dbl1: ; Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN
173 not_s DBL0L,DBL1H
174 cmp r6,r9
175 sub_s.ne DBL0L,DBL0L,DBL0L
176 tst_s DBL0H,DBL0H
177 add_s DBL0H,DBL1H,DBL0L
178 j_s.d [blink]
179 bxor.mi DBL0H,DBL0H,31
180 .Linf_nan_dbl0:
181 tst_s DBL1H,DBL1H
182 j_s.d [blink]
183 bxor.mi DBL0H,DBL0H,31
184 .balign 4
185 .global __divdf3
186 /* N.B. the spacing between divtab and the add3 to get its address must
187 be a multiple of 8. */
188 __divdf3:
189 asl r8,DBL1H,12
190 lsr r12,DBL1L,20
191 lsr r4,r8,26
192 add3 r10,pcl,59 ; (.Ldivtab-.) >> 3
193 ld.as r4,[r10,r4]
194 ld.as r9,[pcl,180]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
195 or r8,r8,r12
196 mpyhu r5,r4,r8
197 and.f r7,DBL1H,r9
198 asl r4,r4,12 ; having the asl here is a concession to the XMAC pipeline.
199 beq.d .Ldenorm_dbl1
200 and r6,DBL0H,r9
201 .Lpast_denorm_dbl1: ; wb stall
202 sub r4,r4,r5
203 mpyhu r5,r4,r4
204 breq.d r6,0,.Ldenorm_dbl0
205 lsr r8,r8,1
206 asl r12,DBL0H,11
207 lsr r10,DBL0L,21
208 .Lpast_denorm_dbl0: ; wb stall
209 bset r8,r8,31
210 mpyhu r11,r5,r8
211 add_s r12,r12,r10
212 bset r5,r12,31
213 cmp r5,r8
214 cmp.eq DBL0L,DBL1L
215 ; wb stall
216 lsr.cc r5,r5,1
217 sub r4,r4,r11 ; u1.31 inverse, about 30 bit
218 mpyhu r11,r5,r4 ; result fraction highpart
219 breq r7,r9,.Linf_nan_dbl1
220 lsr r8,r8,2 ; u3.29
221 add r5,r6, /* wait for immediate / XMAC wb stall */ \
222 0x3fe00000
223 ; wb stall (not for XMAC)
224 breq r6,r9,.Linf_nan_dbl0
225 mpyu r12,r11,r8 ; u-28.31
226 asl_s DBL1L,DBL1L,9 ; u-29.23:9
227 sbc r6,r5,r7
228 ; resource conflict (not for XMAC)
229 mpyhu r5,r11,DBL1L ; u-28.23:9
230 add.cs DBL0L,DBL0L,DBL0L
231 asl_s DBL0L,DBL0L,6 ; u-26.25:7
232 asl r10,r11,23
233 sub_l DBL0L,DBL0L,r12
234 ; wb stall (before 'and' for XMAC)
235 lsr r7,r11,9
236 sub r5,DBL0L,r5 ; rest msw ; u-26.31:0
237 mpyh r12,r5,r4 ; result fraction lowpart
238 xor.f 0,DBL0H,DBL1H
239 and DBL0H,r6,r9
240 add_s DBL0H,DBL0H,r7 ; (XMAC wb stall)
241 bxor.mi DBL0H,DBL0H,31
242 brhs r6, /* wb stall / wait for immediate */ \
243 0x7fe00000,.Linf_denorm
244 add.f r12,r12,0x11
245 asr r9,r12,5
246 sub.mi DBL0H,DBL0H,1
247 add.f DBL0L,r9,r10
248 tst r12,0x1c
249 jne.d [blink]
250 add.cs DBL0H,DBL0H,1
251 /* work out exact rounding if we fall through here. */
252 /* We know that the exact result cannot be represented in double
253 precision. Find the mid-point between the two nearest
254 representable values, multiply with the divisor, and check if
255 the result is larger than the dividend. Since we want to know
256 only the sign bit, it is sufficient to calculate only the
257 highpart of the lower 64 bits. */
258 sub.f DBL0L,DBL0L,1
259 asl r12,r9,2 ; u-22.30:2
260 mpyu r10,r11,DBL1L ; rest before considering r12 in r5 : -r10
261 sub.cs DBL0H,DBL0H,1
262 sub.f r12,r12,2
263 ; resource conflict (not for XMAC)
264 mpyhu r7,r12,DBL1L ; u-51.32
265 asl r5,r5,25 ; s-51.7:25
266 lsr r10,r10,7 ; u-51.30:2
267 ; resource conflict (not for XMAC)
268 ; resource conflict (not for XMAC)
269 mpyu r9,r12,r8 ; u-51.31:1
270 sub r5,r5,r10
271 add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L
272 bset r7,r7,0 ; make sure that the result is not zero, and that
273 ; wb stall (one earlier for XMAC)
274 sub r5,r5,r7 ; a highpart zero appears negative
275 sub.f r5,r5,r9 ; rest msw
276 add.pl.f DBL0L,DBL0L,1
277 j_s.d [blink]
278 add.eq DBL0H,DBL0H,1
279
280 .balign 4
281 .Linf_denorm:
282 brlo r6,0xc0000000,.Linf
283 .Ldenorm:
284 asr r6,r6,20
285 neg r9,r6
286 mov_s DBL0H,0
287 brhs.d r9,54,.Lret0
288 bxor.mi DBL0H,DBL0H,31
289 add_l r12,r12,1
290 and r12,r12,-4
291 rsub r7,r6,5
292 asr r10,r12,28
293 bmsk r4,r12,27
294 asrs DBL0L,r4,r7
295 add DBL1H,r11,r10
296 add.f r7,r6,32-5
297 abss r10,r4
298 asl r4,r4,r7
299 mov.mi r4,r10
300 add.f r10,r6,23
301 rsub r7,r6,9
302 lsr r7,DBL1H,r7
303 asl r10,DBL1H,r10
304 or.pnz DBL0H,DBL0H,r7
305 or.mi r4,r4,r10
306 mov.mi r10,r7
307 add.f DBL0L,r10,DBL0L
308 add.cs.f DBL0H,DBL0H,1 ; carry clear after this point
309 bxor.f 0,r4,31
310 add.pnz.f DBL0L,DBL0L,1
311 add.cs.f DBL0H,DBL0H,1
312 jne_l [blink]
313 /* Calculation so far was not conclusive; calculate further rest. */
314 mpyu r11,r11,DBL1L ; rest before considering r12 in r5 : -r11
315 asr.f r12,r12,3
316 asl r5,r5,25 ; s-51.7:25
317 ; resource conflict (not for XMAC)
318 mpyu DBL1H,r12,r8 ; u-51.31:1
319 and r9,DBL0L,1 ; tie-breaker: round to even
320 lsr r11,r11,7 ; u-51.30:2
321 ; resource conflict (not for XMAC)
322 mpyhu r8,r12,DBL1L ; u-51.32
323 sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L
324 add_s DBL1H,DBL1H,r11
325 ; resource conflict (not for XMAC)
326 ; resource conflict (not for XMAC)
327 mpyu r12,r12,DBL1L ; u-83.30:2
328 sub DBL1H,DBL1H,r5 ; -rest msw
329 add_s DBL1H,DBL1H,r8 ; -rest msw
330 add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-(
331 ; wb stall (XMAC: Before add.f)
332 tst_s DBL1H,DBL1H
333 cmp.eq r12,r9
334 add.cs.f DBL0L,DBL0L,1
335 j_s.d [blink]
336 add.cs DBL0H,DBL0H,1
337
338 .Lret0:
339 /* return +- 0 */
340 j_s.d [blink]
341 mov_s DBL0L,0
342 .Linf:
343 mov_s DBL0H,r9
344 mov_s DBL0L,0
345 j_s.d [blink]
346 bxor.mi DBL0H,DBL0H,31
347
348 .balign 4
349 .Ldivtab:
350 .long 0xfc0fffe1
351 .long 0xf46ffdfb
352 .long 0xed1ffa54
353 .long 0xe61ff515
354 .long 0xdf7fee75
355 .long 0xd91fe680
356 .long 0xd2ffdd52
357 .long 0xcd1fd30c
358 .long 0xc77fc7cd
359 .long 0xc21fbbb6
360 .long 0xbcefaec0
361 .long 0xb7efa100
362 .long 0xb32f92bf
363 .long 0xae8f83b7
364 .long 0xaa2f7467
365 .long 0xa5ef6479
366 .long 0xa1cf53fa
367 .long 0x9ddf433e
368 .long 0x9a0f3216
369 .long 0x965f2091
370 .long 0x92df0f11
371 .long 0x8f6efd05
372 .long 0x8c1eeacc
373 .long 0x88eed876
374 .long 0x85dec615
375 .long 0x82eeb3b9
376 .long 0x800ea10b
377 .long 0x7d3e8e0f
378 .long 0x7a8e7b3f
379 .long 0x77ee6836
380 .long 0x756e5576
381 .long 0x72fe4293
382 .long 0x709e2f93
383 .long 0x6e4e1c7f
384 .long 0x6c0e095e
385 .long 0x69edf6c5
386 .long 0x67cde3a5
387 .long 0x65cdd125
388 .long 0x63cdbe25
389 .long 0x61ddab3f
390 .long 0x600d991f
391 .long 0x5e3d868c
392 .long 0x5c6d7384
393 .long 0x5abd615f
394 .long 0x590d4ecd
395 .long 0x576d3c83
396 .long 0x55dd2a89
397 .long 0x545d18e9
398 .long 0x52dd06e9
399 .long 0x516cf54e
400 .long 0x4ffce356
401 .long 0x4e9cd1ce
402 .long 0x4d3cbfec
403 .long 0x4becae86
404 .long 0x4aac9da4
405 .long 0x496c8c73
406 .long 0x483c7bd3
407 .long 0x470c6ae8
408 .long 0x45dc59af
409 .long 0x44bc4915
410 .long 0x43ac3924
411 .long 0x428c27fb
412 .long 0x418c187a
413 .long 0x407c07bd
414 .L7ff00000:
415 .long 0x7ff00000
416 ENDFUNC(__divdf3)