[thirdparty/gcc.git] / libgcc / config / arc / ieee-754 / arc600-mul64 / divdf3.S

/* Copyright (C) 2008-2023 Free Software Foundation, Inc.
   Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
		on behalf of Synopsys Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/*
   to calculate a := b/x as b*y, with y := 1/x:
   - x is in the range [1..2)
   - calculate 15..18 bit inverse y0 using a table of approximating polynoms.
     Precision is higher for polynoms used to evaluate input with larger
     value.
   - Do one newton-raphson iteration step to double the precision,
     then multiply this with the divisor
	-> more time to decide if dividend is subnormal
     - the worst error propagation is on the side of the value range
       with the least initial defect, thus giving us about 30 bits precision.
      The truncation error for the either is less than 1 + x/2 ulp.
      A 31 bit inverse can be simply calculated by using x with implicit 1
      and chaining the multiplies.  For a 32 bit inverse, we multiply y0^2
      with the bare fraction part of x, then add in y0^2 for the implicit
      1 of x.
    - If calculating a 31 bit inverse, the systematic error is less than
      -1 ulp; likewise, for 32 bit, it is less than -2 ulp.
    - If we calculate our seed with a 32 bit fraction, we can archive a
      tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we
      only need to take the step to calculate the 2nd stage rest and
      rounding adjust 1/32th of the time.  However, if we use a 20 bit
      fraction for the seed, the negative error can exceed -2 ulp/128, (2)
      thus for a simple add / tst check, we need to do the 2nd stage
      rest calculation/ rounding adjust 1/16th of the time.
      (1): The inexactness of the 32 bit inverse contributes an error in the
      range of (-1 .. +(1+x/2) ) ulp/128.  Leaving out the low word of the
      rest contributes an error < +1/x ulp/128 .  In the interval [1,2),
      x/2 + 1/x <= 1.5 .
      (2): Unless proven otherwise.  I have not actually looked for an
      example where -2 ulp/128 is exceeded, and my calculations indicate
      that the excess, if existent, is less than -1/512 ulp.
    ??? The algorithm is still based on the ARC700 optimized code.
    Maybe we could make better use of 64 bit multiply results and/or mmed .
 */
#include "../arc-ieee-754.h"

/* N.B. fp-bit.c does double rounding on denormal numbers.  */
#if 0 /* DEBUG */
	.global __divdf3
	FUNC(__divdf3)
	.balign 4
__divdf3:
	push_s blink
	push_s r2
	push_s r3
	push_s r0
	bl.d __divdf3_c
	push_s r1
	ld_s r2,[sp,12]
	ld_s r3,[sp,8]
	st_s r0,[sp,12]
	st_s r1,[sp,8]
	pop_s r1
	bl.d __divdf3_asm
	pop_s r0
	pop_s r3
	pop_s r2
	pop_s blink
	cmp r0,r2
	cmp.eq r1,r3
	jeq_s [blink]
	and r12,DBL0H,DBL1H
	bic.f 0,0x7ff80000,r12 ; both NaN -> OK
	jeq_s [blink]
	bl abort
	ENDFUNC(__divdf3)
#define __divdf3 __divdf3_asm
#endif /* DEBUG */

	FUNC(__divdf3)
	.balign 4
.L7ff00000:
	.long 0x7ff00000
.Ldivtab:
	.long 0xfc0fffe1
	.long 0xf46ffdfb
	.long 0xed1ffa54
	.long 0xe61ff515
	.long 0xdf7fee75
	.long 0xd91fe680
	.long 0xd2ffdd52
	.long 0xcd1fd30c
	.long 0xc77fc7cd
	.long 0xc21fbbb6
	.long 0xbcefaec0
	.long 0xb7efa100
	.long 0xb32f92bf
	.long 0xae8f83b7
	.long 0xaa2f7467
	.long 0xa5ef6479
	.long 0xa1cf53fa
	.long 0x9ddf433e
	.long 0x9a0f3216
	.long 0x965f2091
	.long 0x92df0f11
	.long 0x8f6efd05
	.long 0x8c1eeacc
	.long 0x88eed876
	.long 0x85dec615
	.long 0x82eeb3b9
	.long 0x800ea10b
	.long 0x7d3e8e0f
	.long 0x7a8e7b3f
	.long 0x77ee6836
	.long 0x756e5576
	.long 0x72fe4293
	.long 0x709e2f93
	.long 0x6e4e1c7f
	.long 0x6c0e095e
	.long 0x69edf6c5
	.long 0x67cde3a5
	.long 0x65cdd125
	.long 0x63cdbe25
	.long 0x61ddab3f
	.long 0x600d991f
	.long 0x5e3d868c
	.long 0x5c6d7384
	.long 0x5abd615f
	.long 0x590d4ecd
	.long 0x576d3c83
	.long 0x55dd2a89
	.long 0x545d18e9
	.long 0x52dd06e9
	.long 0x516cf54e
	.long 0x4ffce356
	.long 0x4e9cd1ce
	.long 0x4d3cbfec
	.long 0x4becae86
	.long 0x4aac9da4
	.long 0x496c8c73
	.long 0x483c7bd3
	.long 0x470c6ae8
	.long 0x45dc59af
	.long 0x44bc4915
	.long 0x43ac3924
	.long 0x428c27fb
	.long 0x418c187a
	.long 0x407c07bd

__divdf3_support: /* This label makes debugger output saner.  */
	.balign 4
.Ldenorm_dbl1:
	brge r6, \
		0x43500000,.Linf_NaN ; large number / denorm -> Inf
	bmsk.f r12,DBL1H,19
	mov.eq r12,DBL1L
	mov.eq DBL1L,0
	sub.eq r7,r7,32
	norm.f r11,r12 ; flag for x/0 -> Inf check
	beq_s .Linf_NaN
	mov.mi r11,0
	add.pl r11,r11,1
	add_s r12,r12,r12
	asl r8,r12,r11
	rsub r12,r11,31
	lsr r12,DBL1L,r12
	tst_s DBL1H,DBL1H
	or r8,r8,r12
	lsr r4,r8,26
	lsr DBL1H,r8,12
	ld.as r4,[r10,r4]
	bxor.mi DBL1H,DBL1H,31
	sub r11,r11,11
	asl DBL1L,DBL1L,r11
	sub r11,r11,1
	mulu64 r4,r8
	sub r7,r7,r11
	b.d .Lpast_denorm_dbl1
	asl r7,r7,20

	.balign 4
.Ldenorm_dbl0:
	bmsk.f r12,DBL0H,19
	; wb stall
	mov.eq r12,DBL0L
	sub.eq r6,r6,32
	norm.f r11,r12 ; flag for 0/x -> 0 check
	brge r7, \
		0x43500000, .Lret0_2 ; denorm/large number -> 0
	beq_s .Lret0_2
	mov.mi r11,0
	add.pl r11,r11,1
	asl r12,r12,r11
	sub r6,r6,r11
	add.f 0,r6,31
	lsr r10,DBL0L,r6
	mov.mi r10,0
	add r6,r6,11+32
	neg.f r11,r6
	asl DBL0L,DBL0L,r11
	mov.pl DBL0L,0
	sub r6,r6,32-1
	b.d .Lpast_denorm_dbl0
	asl r6,r6,20

.Linf_NaN:
	tst_s DBL0L,DBL0L ; 0/0 -> NaN
	xor_s DBL1H,DBL1H,DBL0H
	bclr.eq.f DBL0H,DBL0H,31
	bmsk DBL0H,DBL1H,30
	xor_s DBL0H,DBL0H,DBL1H
	sub.eq DBL0H,DBL0H,1
	mov_s DBL0L,0
	j_s.d [blink]
	or DBL0H,DBL0H,r9
	.balign 4
.Lret0_2:
	xor_s DBL1H,DBL1H,DBL0H
	mov_s DBL0L,0
	bmsk DBL0H,DBL1H,30
	j_s.d [blink]
	xor_s DBL0H,DBL0H,DBL1H
	.balign 4
	.global __divdf3
/* N.B. the spacing between divtab and the sub3 to get its address must
   be a multiple of 8.  */
__divdf3:
	asl r8,DBL1H,12
	lsr r4,r8,26
	sub3 r10,pcl,61; (.-.Ldivtab) >> 3
	ld.as r9,[pcl,-124]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
	ld.as r4,[r10,r4]
	lsr r12,DBL1L,20
	and.f r7,DBL1H,r9
	or r8,r8,r12
	mulu64 r4,r8
	beq.d .Ldenorm_dbl1
.Lpast_denorm_dbl1:
	and.f r6,DBL0H,r9
	breq.d r7,r9,.Linf_nan_dbl1
	asl r4,r4,12
	sub r4,r4,mhi
	mulu64 r4,r4
	beq.d .Ldenorm_dbl0
	lsr r8,r8,1
	breq.d r6,r9,.Linf_nan_dbl0
	asl r12,DBL0H,11
	lsr r10,DBL0L,21
.Lpast_denorm_dbl0:
	bset r8,r8,31
	mulu64 mhi,r8
	add_s r12,r12,r10
	bset r5,r12,31
	cmp r5,r8
	cmp.eq DBL0L,DBL1L
	lsr.cc r5,r5,1
	sub r4,r4,mhi ; u1.31 inverse, about 30 bit
	mulu64 r5,r4 ; result fraction highpart
	lsr r8,r8,2 ; u3.29
	add r5,r6, /* wait for immediate */ \
		0x3fe00000
	mov r11,mhi ; result fraction highpart
	mulu64 r11,r8 ; u-28.31
	asl_s DBL1L,DBL1L,9 ; u-29.23:9
	sbc r6,r5,r7
	mov r12,mlo ; u-28.31
	mulu64 r11,DBL1L ; mhi: u-28.23:9
	add.cs DBL0L,DBL0L,DBL0L
	asl_s DBL0L,DBL0L,6 ; u-26.25:7
	asl r10,r11,23
	sub_l DBL0L,DBL0L,r12
	lsr r7,r11,9
	sub r5,DBL0L,mhi ; rest msw ; u-26.31:0
	mul64 r5,r4 ; mhi: result fraction lowpart
	xor.f 0,DBL0H,DBL1H
	and DBL0H,r6,r9
	add_s DBL0H,DBL0H,r7
	bclr r12,r9,20 ; 0x7fe00000
	brhs.d r6,r12,.Linf_denorm
	bxor.mi DBL0H,DBL0H,31
	add.f r12,mhi,0x11
	asr r9,r12,5
	sub.mi DBL0H,DBL0H,1
	add.f DBL0L,r9,r10
	tst r12,0x1c
	jne.d [blink]
	add.cs DBL0H,DBL0H,1
        /* work out exact rounding if we fall through here.  */
        /* We know that the exact result cannot be represented in double
           precision.  Find the mid-point between the two nearest
           representable values, multiply with the divisor, and check if
           the result is larger than the dividend.  Since we want to know
	   only the sign bit, it is sufficient to calculate only the
	   highpart of the lower 64 bits.  */
	mulu64 r11,DBL1L ; rest before considering r12 in r5 : -mlo
	sub.f DBL0L,DBL0L,1
	asl r12,r9,2 ; u-22.30:2
	sub.cs DBL0H,DBL0H,1
	sub.f r12,r12,2
	mov r10,mlo ; rest before considering r12 in r5 : -r10
	mulu64 r12,DBL1L ; mhi: u-51.32
	asl r5,r5,25 ; s-51.7:25
	lsr r10,r10,7 ; u-51.30:2
	mov r7,mhi ; u-51.32
	mulu64 r12,r8 ; mlo: u-51.31:1
	sub r5,r5,r10
	add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L
	bset r7,r7,0 ; make sure that the result is not zero, and that
	sub r5,r5,r7 ; a highpart zero appears negative
	sub.f r5,r5,mlo ; rest msw
	add.pl.f DBL0L,DBL0L,1
	j_s.d [blink]
	add.eq DBL0H,DBL0H,1

.Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN
	or.f 0,r6,DBL0L
	cmp.ne r6,r9
	not_s DBL0L,DBL1H
	sub_s.ne DBL0L,DBL0L,DBL0L
	tst_s DBL0H,DBL0H
	add_s DBL0H,DBL1H,DBL0L
	j_s.d [blink]
	bxor.mi DBL0H,DBL0H,31
.Linf_nan_dbl0:
	tst_s DBL1H,DBL1H
	j_s.d [blink]
	bxor.mi DBL0H,DBL0H,31
	.balign 4
.Linf_denorm:
	lsr r12,r6,28
	brlo.d r12,0xc,.Linf
.Ldenorm:
	asr r6,r6,20
	neg r9,r6
	mov_s DBL0H,0
	brhs.d r9,54,.Lret0
	bxor.mi DBL0H,DBL0H,31
	add r12,mhi,1
	and r12,r12,-4
	rsub r7,r6,5
	asr r10,r12,28
	bmsk r4,r12,27
	min r7,r7,31
	asr DBL0L,r4,r7
	add DBL1H,r11,r10
	abs.f r10,r4
	sub.mi r10,r10,1
	add.f r7,r6,32-5
	asl r4,r4,r7
	mov.mi r4,r10
	add.f r10,r6,23
	rsub r7,r6,9
	lsr r7,DBL1H,r7
	asl r10,DBL1H,r10
	or.pnz DBL0H,DBL0H,r7
	or.mi r4,r4,r10
	mov.mi r10,r7
	add.f DBL0L,r10,DBL0L
	add.cs.f DBL0H,DBL0H,1 ; carry clear after this point
	bxor.f 0,r4,31
	add.pnz.f DBL0L,DBL0L,1
	add.cs.f DBL0H,DBL0H,1
	jne_s [blink]
	/* Calculation so far was not conclusive; calculate further rest.  */
	mulu64 r11,DBL1L ; rest before considering r12 in r5 : -mlo
	asr.f r12,r12,3
	asl r5,r5,25 ; s-51.7:25
	mov r11,mlo ; rest before considering r12 in r5 : -r11
	mulu64 r12,r8 ; u-51.31:1
	and r9,DBL0L,1 ; tie-breaker: round to even
	lsr r11,r11,7 ; u-51.30:2
	mov DBL1H,mlo ; u-51.31:1
	mulu64 r12,DBL1L ; u-51.62:2
	sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L
	add_s DBL1H,DBL1H,r11
	sub DBL1H,DBL1H,r5 ; -rest msw
	add_s DBL1H,DBL1H,mhi ; -rest msw
	add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-(
	tst_s DBL1H,DBL1H
	cmp.eq mlo,r9
	add.cs.f DBL0L,DBL0L,1
	j_s.d [blink]
	add.cs DBL0H,DBL0H,1

.Lret0:
	/* return +- 0 */
	j_s.d [blink]
	mov_s DBL0L,0
.Linf:
	mov_s DBL0H,r9
	mov_s DBL0L,0
	j_s.d [blink]
	bxor.mi DBL0H,DBL0H,31
	ENDFUNC(__divdf3)
Commit	Line	Data
83ffe9cd	1	/* Copyright (C) 2008-2023 Free Software Foundation, Inc.
d38a64b4 JR	2	Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
	3	on behalf of Synopsys Inc.
	4
	5	This file is part of GCC.
	6
	7	GCC is free software; you can redistribute it and/or modify it under
	8	the terms of the GNU General Public License as published by the Free
	9	Software Foundation; either version 3, or (at your option) any later
	10	version.
	11
	12	GCC is distributed in the hope that it will be useful, but WITHOUT ANY
	13	WARRANTY; without even the implied warranty of MERCHANTABILITY or
	14	FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	15	for more details.
	16
	17	Under Section 7 of GPL version 3, you are granted additional
	18	permissions described in the GCC Runtime Library Exception, version
	19	3.1, as published by the Free Software Foundation.
	20
	21	You should have received a copy of the GNU General Public License and
	22	a copy of the GCC Runtime Library Exception along with this program;
	23	see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
	24	<http://www.gnu.org/licenses/>. */
	25
	26	/*
	27	to calculate a := b/x as b*y, with y := 1/x:
	28	- x is in the range [1..2)
	29	- calculate 15..18 bit inverse y0 using a table of approximating polynoms.
	30	Precision is higher for polynoms used to evaluate input with larger
	31	value.
	32	- Do one newton-raphson iteration step to double the precision,
	33	then multiply this with the divisor
	34	-> more time to decide if dividend is subnormal
	35	- the worst error propagation is on the side of the value range
	36	with the least initial defect, thus giving us about 30 bits precision.
	37	The truncation error for the either is less than 1 + x/2 ulp.
	38	A 31 bit inverse can be simply calculated by using x with implicit 1
	39	and chaining the multiplies. For a 32 bit inverse, we multiply y0^2
	40	with the bare fraction part of x, then add in y0^2 for the implicit
	41	1 of x.
	42	- If calculating a 31 bit inverse, the systematic error is less than
	43	-1 ulp; likewise, for 32 bit, it is less than -2 ulp.
	44	- If we calculate our seed with a 32 bit fraction, we can archive a
	45	tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we
	46	only need to take the step to calculate the 2nd stage rest and
	47	rounding adjust 1/32th of the time. However, if we use a 20 bit
	48	fraction for the seed, the negative error can exceed -2 ulp/128, (2)
	49	thus for a simple add / tst check, we need to do the 2nd stage
	50	rest calculation/ rounding adjust 1/16th of the time.
	51	(1): The inexactness of the 32 bit inverse contributes an error in the
	52	range of (-1 .. +(1+x/2) ) ulp/128. Leaving out the low word of the
	53	rest contributes an error < +1/x ulp/128 . In the interval [1,2),
	54	x/2 + 1/x <= 1.5 .
	55	(2): Unless proven otherwise. I have not actually looked for an
	56	example where -2 ulp/128 is exceeded, and my calculations indicate
	57	that the excess, if existent, is less than -1/512 ulp.
	58	??? The algorithm is still based on the ARC700 optimized code.
	59	Maybe we could make better use of 64 bit multiply results and/or mmed .
	60	*/
	61	#include "../arc-ieee-754.h"
	62
	63	/* N.B. fp-bit.c does double rounding on denormal numbers. */
	64	#if 0 /* DEBUG */
	65	.global __divdf3
66	FUNC(__divdf3)
67	.balign 4
68	__divdf3:
69	push_s blink
70	push_s r2
71	push_s r3
72	push_s r0
73	bl.d __divdf3_c
74	push_s r1
75	ld_s r2,[sp,12]
76	ld_s r3,[sp,8]
77	st_s r0,[sp,12]
78	st_s r1,[sp,8]
79	pop_s r1
80	bl.d __divdf3_asm
81	pop_s r0
82	pop_s r3
83	pop_s r2
84	pop_s blink
85	cmp r0,r2
86	cmp.eq r1,r3
87	jeq_s [blink]
88	and r12,DBL0H,DBL1H
89	bic.f 0,0x7ff80000,r12 ; both NaN -> OK
90	jeq_s [blink]
91	bl abort
92	ENDFUNC(__divdf3)
93	#define __divdf3 __divdf3_asm
94	#endif /* DEBUG */
95
96	FUNC(__divdf3)
97	.balign 4
98	.L7ff00000:
99	.long 0x7ff00000
100	.Ldivtab:
101	.long 0xfc0fffe1
102	.long 0xf46ffdfb
103	.long 0xed1ffa54
104	.long 0xe61ff515
105	.long 0xdf7fee75
106	.long 0xd91fe680
107	.long 0xd2ffdd52
108	.long 0xcd1fd30c
109	.long 0xc77fc7cd
110	.long 0xc21fbbb6
111	.long 0xbcefaec0
112	.long 0xb7efa100
113	.long 0xb32f92bf
114	.long 0xae8f83b7
115	.long 0xaa2f7467
116	.long 0xa5ef6479
117	.long 0xa1cf53fa
118	.long 0x9ddf433e
119	.long 0x9a0f3216
120	.long 0x965f2091
121	.long 0x92df0f11
122	.long 0x8f6efd05
123	.long 0x8c1eeacc
124	.long 0x88eed876
125	.long 0x85dec615
126	.long 0x82eeb3b9
127	.long 0x800ea10b
128	.long 0x7d3e8e0f
129	.long 0x7a8e7b3f
130	.long 0x77ee6836
131	.long 0x756e5576
132	.long 0x72fe4293
133	.long 0x709e2f93
134	.long 0x6e4e1c7f
135	.long 0x6c0e095e
136	.long 0x69edf6c5
137	.long 0x67cde3a5
138	.long 0x65cdd125
139	.long 0x63cdbe25
140	.long 0x61ddab3f
141	.long 0x600d991f
142	.long 0x5e3d868c
143	.long 0x5c6d7384
144	.long 0x5abd615f
145	.long 0x590d4ecd
146	.long 0x576d3c83
147	.long 0x55dd2a89
148	.long 0x545d18e9
149	.long 0x52dd06e9
150	.long 0x516cf54e
151	.long 0x4ffce356
152	.long 0x4e9cd1ce
153	.long 0x4d3cbfec
154	.long 0x4becae86
155	.long 0x4aac9da4
156	.long 0x496c8c73
157	.long 0x483c7bd3
158	.long 0x470c6ae8
159	.long 0x45dc59af
160	.long 0x44bc4915
161	.long 0x43ac3924
162	.long 0x428c27fb
163	.long 0x418c187a
164	.long 0x407c07bd
165
166	__divdf3_support: /* This label makes debugger output saner. */
167	.balign 4
168	.Ldenorm_dbl1:
169	brge r6, \
170	0x43500000,.Linf_NaN ; large number / denorm -> Inf
171	bmsk.f r12,DBL1H,19
172	mov.eq r12,DBL1L
173	mov.eq DBL1L,0
174	sub.eq r7,r7,32
175	norm.f r11,r12 ; flag for x/0 -> Inf check
176	beq_s .Linf_NaN
177	mov.mi r11,0
178	add.pl r11,r11,1
179	add_s r12,r12,r12
180	asl r8,r12,r11
181	rsub r12,r11,31
182	lsr r12,DBL1L,r12
183	tst_s DBL1H,DBL1H
184	or r8,r8,r12
185	lsr r4,r8,26
186	lsr DBL1H,r8,12
187	ld.as r4,[r10,r4]
188	bxor.mi DBL1H,DBL1H,31
189	sub r11,r11,11
190	asl DBL1L,DBL1L,r11
191	sub r11,r11,1
192	mulu64 r4,r8
193	sub r7,r7,r11
194	b.d .Lpast_denorm_dbl1
195	asl r7,r7,20
196
197	.balign 4
198	.Ldenorm_dbl0:
199	bmsk.f r12,DBL0H,19
200	; wb stall
201	mov.eq r12,DBL0L
202	sub.eq r6,r6,32
203	norm.f r11,r12 ; flag for 0/x -> 0 check
204	brge r7, \
205	0x43500000, .Lret0_2 ; denorm/large number -> 0
206	beq_s .Lret0_2
207	mov.mi r11,0
208	add.pl r11,r11,1
209	asl r12,r12,r11
210	sub r6,r6,r11
211	add.f 0,r6,31
212	lsr r10,DBL0L,r6
213	mov.mi r10,0
214	add r6,r6,11+32
215	neg.f r11,r6
216	asl DBL0L,DBL0L,r11
217	mov.pl DBL0L,0
218	sub r6,r6,32-1
219	b.d .Lpast_denorm_dbl0
220	asl r6,r6,20
221
222	.Linf_NaN:
223	tst_s DBL0L,DBL0L ; 0/0 -> NaN
224	xor_s DBL1H,DBL1H,DBL0H
225	bclr.eq.f DBL0H,DBL0H,31
226	bmsk DBL0H,DBL1H,30
227	xor_s DBL0H,DBL0H,DBL1H
228	sub.eq DBL0H,DBL0H,1
229	mov_s DBL0L,0
230	j_s.d [blink]
231	or DBL0H,DBL0H,r9
232	.balign 4
233	.Lret0_2:
234	xor_s DBL1H,DBL1H,DBL0H
235	mov_s DBL0L,0
236	bmsk DBL0H,DBL1H,30
237	j_s.d [blink]
238	xor_s DBL0H,DBL0H,DBL1H
239	.balign 4
240	.global __divdf3
241	/* N.B. the spacing between divtab and the sub3 to get its address must
242	be a multiple of 8. */
243	__divdf3:
244	asl r8,DBL1H,12
245	lsr r4,r8,26
246	sub3 r10,pcl,61; (.-.Ldivtab) >> 3
247	ld.as r9,[pcl,-124]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
248	ld.as r4,[r10,r4]
249	lsr r12,DBL1L,20
250	and.f r7,DBL1H,r9
251	or r8,r8,r12
252	mulu64 r4,r8
253	beq.d .Ldenorm_dbl1
254	.Lpast_denorm_dbl1:
255	and.f r6,DBL0H,r9
256	breq.d r7,r9,.Linf_nan_dbl1
257	asl r4,r4,12
258	sub r4,r4,mhi
259	mulu64 r4,r4
260	beq.d .Ldenorm_dbl0
261	lsr r8,r8,1
262	breq.d r6,r9,.Linf_nan_dbl0
263	asl r12,DBL0H,11
264	lsr r10,DBL0L,21
265	.Lpast_denorm_dbl0:
266	bset r8,r8,31
267	mulu64 mhi,r8
268	add_s r12,r12,r10
269	bset r5,r12,31
270	cmp r5,r8
271	cmp.eq DBL0L,DBL1L
272	lsr.cc r5,r5,1
273	sub r4,r4,mhi ; u1.31 inverse, about 30 bit
274	mulu64 r5,r4 ; result fraction highpart
275	lsr r8,r8,2 ; u3.29
276	add r5,r6, /* wait for immediate */ \
277	0x3fe00000
278	mov r11,mhi ; result fraction highpart
279	mulu64 r11,r8 ; u-28.31
280	asl_s DBL1L,DBL1L,9 ; u-29.23:9
281	sbc r6,r5,r7
282	mov r12,mlo ; u-28.31
283	mulu64 r11,DBL1L ; mhi: u-28.23:9
284	add.cs DBL0L,DBL0L,DBL0L
285	asl_s DBL0L,DBL0L,6 ; u-26.25:7
286	asl r10,r11,23
287	sub_l DBL0L,DBL0L,r12
288	lsr r7,r11,9
289	sub r5,DBL0L,mhi ; rest msw ; u-26.31:0
290	mul64 r5,r4 ; mhi: result fraction lowpart
291	xor.f 0,DBL0H,DBL1H
292	and DBL0H,r6,r9
293	add_s DBL0H,DBL0H,r7
294	bclr r12,r9,20 ; 0x7fe00000
295	brhs.d r6,r12,.Linf_denorm
296	bxor.mi DBL0H,DBL0H,31
297	add.f r12,mhi,0x11
298	asr r9,r12,5
299	sub.mi DBL0H,DBL0H,1
300	add.f DBL0L,r9,r10
301	tst r12,0x1c
302	jne.d [blink]
303	add.cs DBL0H,DBL0H,1
304	/* work out exact rounding if we fall through here. */
305	/* We know that the exact result cannot be represented in double
306	precision. Find the mid-point between the two nearest
307	representable values, multiply with the divisor, and check if
308	the result is larger than the dividend. Since we want to know
309	only the sign bit, it is sufficient to calculate only the
310	highpart of the lower 64 bits. */
311	mulu64 r11,DBL1L ; rest before considering r12 in r5 : -mlo
312	sub.f DBL0L,DBL0L,1
313	asl r12,r9,2 ; u-22.30:2
314	sub.cs DBL0H,DBL0H,1
315	sub.f r12,r12,2
316	mov r10,mlo ; rest before considering r12 in r5 : -r10
317	mulu64 r12,DBL1L ; mhi: u-51.32
318	asl r5,r5,25 ; s-51.7:25
319	lsr r10,r10,7 ; u-51.30:2
320	mov r7,mhi ; u-51.32
321	mulu64 r12,r8 ; mlo: u-51.31:1
322	sub r5,r5,r10
323	add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L
324	bset r7,r7,0 ; make sure that the result is not zero, and that
325	sub r5,r5,r7 ; a highpart zero appears negative
326	sub.f r5,r5,mlo ; rest msw
327	add.pl.f DBL0L,DBL0L,1
328	j_s.d [blink]
329	add.eq DBL0H,DBL0H,1
330
331	.Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN
332	or.f 0,r6,DBL0L
333	cmp.ne r6,r9
334	not_s DBL0L,DBL1H
335	sub_s.ne DBL0L,DBL0L,DBL0L
336	tst_s DBL0H,DBL0H
337	add_s DBL0H,DBL1H,DBL0L
338	j_s.d [blink]
339	bxor.mi DBL0H,DBL0H,31
340	.Linf_nan_dbl0:
341	tst_s DBL1H,DBL1H
342	j_s.d [blink]
343	bxor.mi DBL0H,DBL0H,31
344	.balign 4
345	.Linf_denorm:
346	lsr r12,r6,28
347	brlo.d r12,0xc,.Linf
348	.Ldenorm:
349	asr r6,r6,20
350	neg r9,r6
351	mov_s DBL0H,0
352	brhs.d r9,54,.Lret0
353	bxor.mi DBL0H,DBL0H,31
354	add r12,mhi,1
355	and r12,r12,-4
356	rsub r7,r6,5
357	asr r10,r12,28
358	bmsk r4,r12,27
359	min r7,r7,31
360	asr DBL0L,r4,r7
361	add DBL1H,r11,r10
362	abs.f r10,r4
363	sub.mi r10,r10,1
364	add.f r7,r6,32-5
365	asl r4,r4,r7
366	mov.mi r4,r10
367	add.f r10,r6,23
368	rsub r7,r6,9
369	lsr r7,DBL1H,r7
370	asl r10,DBL1H,r10
371	or.pnz DBL0H,DBL0H,r7
372	or.mi r4,r4,r10
373	mov.mi r10,r7
374	add.f DBL0L,r10,DBL0L
375	add.cs.f DBL0H,DBL0H,1 ; carry clear after this point
376	bxor.f 0,r4,31
377	add.pnz.f DBL0L,DBL0L,1
378	add.cs.f DBL0H,DBL0H,1
379	jne_s [blink]
380	/* Calculation so far was not conclusive; calculate further rest. */
381	mulu64 r11,DBL1L ; rest before considering r12 in r5 : -mlo
382	asr.f r12,r12,3
383	asl r5,r5,25 ; s-51.7:25
384	mov r11,mlo ; rest before considering r12 in r5 : -r11
385	mulu64 r12,r8 ; u-51.31:1
386	and r9,DBL0L,1 ; tie-breaker: round to even
387	lsr r11,r11,7 ; u-51.30:2
388	mov DBL1H,mlo ; u-51.31:1
389	mulu64 r12,DBL1L ; u-51.62:2
390	sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L
391	add_s DBL1H,DBL1H,r11
392	sub DBL1H,DBL1H,r5 ; -rest msw
393	add_s DBL1H,DBL1H,mhi ; -rest msw
394	add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-(
395	tst_s DBL1H,DBL1H
396	cmp.eq mlo,r9
397	add.cs.f DBL0L,DBL0L,1
398	j_s.d [blink]
399	add.cs DBL0H,DBL0H,1
400
401	.Lret0:
402	/* return +- 0 */
403	j_s.d [blink]
404	mov_s DBL0L,0
405	.Linf:
406	mov_s DBL0H,r9
407	mov_s DBL0L,0
408	j_s.d [blink]
409	bxor.mi DBL0H,DBL0H,31
410	ENDFUNC(__divdf3)