[thirdparty/gcc.git] / libgcc / config / arc / ieee-754 / arc600-dsp / divdf3.S

/* Copyright (C) 2008-2023 Free Software Foundation, Inc.
   Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
		on behalf of Synopsys Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/*
   to calculate a := b/x as b*y, with y := 1/x:
   - x is in the range [1..2)
   - calculate 15..18 bit inverse y0 using a table of approximating polynoms.
     Precision is higher for polynoms used to evaluate input with larger
     value.
   - Do one newton-raphson iteration step to double the precision,
     then multiply this with the divisor
	-> more time to decide if dividend is subnormal
     - the worst error propagation is on the side of the value range
       with the least initial defect, thus giving us about 30 bits precision.
      The truncation error for the either is less than 1 + x/2 ulp.
      A 31 bit inverse can be simply calculated by using x with implicit 1
      and chaining the multiplies.  For a 32 bit inverse, we multiply y0^2
      with the bare fraction part of x, then add in y0^2 for the implicit
      1 of x.
    - If calculating a 31 bit inverse, the systematic error is less than
      -1 ulp; likewise, for 32 bit, it is less than -2 ulp.
    - If we calculate our seed with a 32 bit fraction, we can archive a
      tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we
      only need to take the step to calculate the 2nd stage rest and
      rounding adjust 1/32th of the time.  However, if we use a 20 bit
      fraction for the seed, the negative error can exceed -2 ulp/128, (2)
      thus for a simple add / tst check, we need to do the 2nd stage
      rest calculation/ rounding adjust 1/16th of the time.
      (1): The inexactness of the 32 bit inverse contributes an error in the
      range of (-1 .. +(1+x/2) ) ulp/128.  Leaving out the low word of the
      rest contributes an error < +1/x ulp/128 .  In the interval [1,2),
      x/2 + 1/x <= 1.5 .
      (2): Unless proven otherwise.  I have not actually looked for an
      example where -2 ulp/128 is exceeded, and my calculations indicate
      that the excess, if existent, is less than -1/512 ulp.
    ??? The algorithm is still based on the ARC700 optimized code.
    Maybe we could make better use of 32x16 bit multiply, or 64 bit multiply
    results.
 */
#include "../arc-ieee-754.h"
#define mlo acc2
#define mhi acc1
#define mul64(b,c) mullw 0,b,c` machlw 0,b,c
#define mulu64(b,c) mululw 0,b,c` machulw 0,b,c

/* N.B. fp-bit.c does double rounding on denormal numbers.  */
#if 0 /* DEBUG */
	.global __divdf3
	FUNC(__divdf3)
	.balign 4
__divdf3:
	push_s blink
	push_s r2
	push_s r3
	push_s r0
	bl.d __divdf3_c
	push_s r1
	ld_s r2,[sp,12]
	ld_s r3,[sp,8]
	st_s r0,[sp,12]
	st_s r1,[sp,8]
	pop_s r1
	bl.d __divdf3_asm
	pop_s r0
	pop_s r3
	pop_s r2
	pop_s blink
	cmp r0,r2
	cmp.eq r1,r3
	jeq_s [blink]
	and r12,DBL0H,DBL1H
	bic.f 0,0x7ff80000,r12 ; both NaN -> OK
	jeq_s [blink]
	bl abort
	ENDFUNC(__divdf3)
#define __divdf3 __divdf3_asm
#endif /* DEBUG */

	FUNC(__divdf3)
	.balign 4
.L7ff00000:
	.long 0x7ff00000
.Ldivtab:
	.long 0xfc0fffe1
	.long 0xf46ffdfb
	.long 0xed1ffa54
	.long 0xe61ff515
	.long 0xdf7fee75
	.long 0xd91fe680
	.long 0xd2ffdd52
	.long 0xcd1fd30c
	.long 0xc77fc7cd
	.long 0xc21fbbb6
	.long 0xbcefaec0
	.long 0xb7efa100
	.long 0xb32f92bf
	.long 0xae8f83b7
	.long 0xaa2f7467
	.long 0xa5ef6479
	.long 0xa1cf53fa
	.long 0x9ddf433e
	.long 0x9a0f3216
	.long 0x965f2091
	.long 0x92df0f11
	.long 0x8f6efd05
	.long 0x8c1eeacc
	.long 0x88eed876
	.long 0x85dec615
	.long 0x82eeb3b9
	.long 0x800ea10b
	.long 0x7d3e8e0f
	.long 0x7a8e7b3f
	.long 0x77ee6836
	.long 0x756e5576
	.long 0x72fe4293
	.long 0x709e2f93
	.long 0x6e4e1c7f
	.long 0x6c0e095e
	.long 0x69edf6c5
	.long 0x67cde3a5
	.long 0x65cdd125
	.long 0x63cdbe25
	.long 0x61ddab3f
	.long 0x600d991f
	.long 0x5e3d868c
	.long 0x5c6d7384
	.long 0x5abd615f
	.long 0x590d4ecd
	.long 0x576d3c83
	.long 0x55dd2a89
	.long 0x545d18e9
	.long 0x52dd06e9
	.long 0x516cf54e
	.long 0x4ffce356
	.long 0x4e9cd1ce
	.long 0x4d3cbfec
	.long 0x4becae86
	.long 0x4aac9da4
	.long 0x496c8c73
	.long 0x483c7bd3
	.long 0x470c6ae8
	.long 0x45dc59af
	.long 0x44bc4915
	.long 0x43ac3924
	.long 0x428c27fb
	.long 0x418c187a
	.long 0x407c07bd

__divdf3_support: /* This label makes debugger output saner.  */
	.balign 4
.Ldenorm_dbl1:
	brge r6, \
		0x43500000,.Linf_NaN ; large number / denorm -> Inf
	bmsk.f r12,DBL1H,19
	mov.eq r12,DBL1L
	mov.eq DBL1L,0
	sub.eq r7,r7,32
	norm.f r11,r12 ; flag for x/0 -> Inf check
	beq_s .Linf_NaN
	mov.mi r11,0
	add.pl r11,r11,1
	add_s r12,r12,r12
	asl r8,r12,r11
	rsub r12,r11,31
	lsr r12,DBL1L,r12
	tst_s DBL1H,DBL1H
	or r8,r8,r12
	lsr r4,r8,26
	lsr DBL1H,r8,12
	ld.as r4,[r10,r4]
	bxor.mi DBL1H,DBL1H,31
	sub r11,r11,11
	asl DBL1L,DBL1L,r11
	sub r11,r11,1
	mulu64 (r4,r8)
	sub r7,r7,r11
	b.d .Lpast_denorm_dbl1
	asl r7,r7,20

.Linf_NaN:
	tst_s DBL0L,DBL0L ; 0/0 -> NaN
	xor_s DBL1H,DBL1H,DBL0H
	bclr.eq.f DBL0H,DBL0H,31
	bmsk DBL0H,DBL1H,30
	xor_s DBL0H,DBL0H,DBL1H
	sub.eq DBL0H,DBL0H,1
	mov_s DBL0L,0
	j_s.d [blink]
	or DBL0H,DBL0H,r9
	.balign 4
.Lret0_2:
	xor_s DBL1H,DBL1H,DBL0H
	mov_s DBL0L,0
	bmsk DBL0H,DBL1H,30
	j_s.d [blink]
	xor_s DBL0H,DBL0H,DBL1H
	.balign 4
	.global __divdf3
/* N.B. the spacing between divtab and the sub3 to get its address must
   be a multiple of 8.  */
__divdf3:
	asl r8,DBL1H,12
	lsr r4,r8,26
	sub3 r10,pcl,51;(.-.Ldivtab) >> 3
	ld.as r9,[pcl,-104]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
	ld.as r4,[r10,r4]
	lsr r12,DBL1L,20
	and.f r7,DBL1H,r9
	or r8,r8,r12
	mulu64 (r4,r8)
	beq.d .Ldenorm_dbl1
.Lpast_denorm_dbl1:
	and.f r6,DBL0H,r9
	breq.d r7,r9,.Linf_nan_dbl1
	asl r4,r4,12
	sub r4,r4,mhi
	mululw 0,r4,r4
	machulw r5,r4,r4
	bne.d .Lnormal_dbl0
	lsr r8,r8,1

	.balign 4
.Ldenorm_dbl0:
	bmsk.f r12,DBL0H,19
	; wb stall
	mov.eq r12,DBL0L
	sub.eq r6,r6,32
	norm.f r11,r12 ; flag for 0/x -> 0 check
	brge r7, \
		0x43500000, .Lret0_2 ; denorm/large number -> 0
	beq_s .Lret0_2
	mov.mi r11,0
	add.pl r11,r11,1
	asl r12,r12,r11
	sub r6,r6,r11
	add.f 0,r6,31
	lsr r10,DBL0L,r6
	mov.mi r10,0
	add r6,r6,11+32
	neg.f r11,r6
	asl DBL0L,DBL0L,r11
	mov.pl DBL0L,0
	sub r6,r6,32-1
	b.d .Lpast_denorm_dbl0
	asl r6,r6,20

	.balign 4
.Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN
	or.f 0,r6,DBL0L
	cmp.ne r6,r9
	not_s DBL0L,DBL1H
	sub_s.ne DBL0L,DBL0L,DBL0L
	tst_s DBL0H,DBL0H
	add_s DBL0H,DBL1H,DBL0L
	j_s.d [blink]
	bxor.mi DBL0H,DBL0H,31

	.balign 4
.Lnormal_dbl0:
	breq.d r6,r9,.Linf_nan_dbl0
	asl r12,DBL0H,11
	lsr r10,DBL0L,21
.Lpast_denorm_dbl0:
	bset r8,r8,31
	mulu64 (r5,r8)
	add_s r12,r12,r10
	bset r5,r12,31
	cmp r5,r8
	cmp.eq DBL0L,DBL1L
	lsr.cc r5,r5,1
	sub r4,r4,mhi ; u1.31 inverse, about 30 bit
	mululw 0,r5,r4
	machulw r11,r5,r4 ; result fraction highpart
	lsr r8,r8,2 ; u3.29
	add r5,r6, /* wait for immediate */ \
		0x3fe00000
	mulu64 (r11,r8) ; u-28.31
	asl_s DBL1L,DBL1L,9 ; u-29.23:9
	sbc r6,r5,r7
	mov r12,mlo ; u-28.31
	mulu64 (r11,DBL1L) ; mhi: u-28.23:9
	add.cs DBL0L,DBL0L,DBL0L
	asl_s DBL0L,DBL0L,6 ; u-26.25:7
	asl r10,r11,23
	sub_l DBL0L,DBL0L,r12
	lsr r7,r11,9
	sub r5,DBL0L,mhi ; rest msw ; u-26.31:0
	mul64 (r5,r4) ; mhi: result fraction lowpart
	xor.f 0,DBL0H,DBL1H
	and DBL0H,r6,r9
	add_s DBL0H,DBL0H,r7
	bclr r12,r9,20 ; 0x7fe00000
	brhs.d r6,r12,.Linf_denorm
	bxor.mi DBL0H,DBL0H,31
	add.f r12,mhi,0x11
	asr r9,r12,5
	sub.mi DBL0H,DBL0H,1
	add.f DBL0L,r9,r10
	tst r12,0x1c
	jne.d [blink]
	add.cs DBL0H,DBL0H,1
        /* work out exact rounding if we fall through here.  */
        /* We know that the exact result cannot be represented in double
           precision.  Find the mid-point between the two nearest
           representable values, multiply with the divisor, and check if
           the result is larger than the dividend.  Since we want to know
	   only the sign bit, it is sufficient to calculate only the
	   highpart of the lower 64 bits.  */
	mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo
	sub.f DBL0L,DBL0L,1
	asl r12,r9,2 ; u-22.30:2
	sub.cs DBL0H,DBL0H,1
	sub.f r12,r12,2
	mov r10,mlo ; rest before considering r12 in r5 : -r10
	mululw 0,r12,DBL1L
	machulw r7,r12,DBL1L ; mhi: u-51.32
	asl r5,r5,25 ; s-51.7:25
	lsr r10,r10,7 ; u-51.30:2
	mulu64 (r12,r8) ; mlo: u-51.31:1
	sub r5,r5,r10
	add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L
	bset r7,r7,0 ; make sure that the result is not zero, and that
	sub r5,r5,r7 ; a highpart zero appears negative
	sub.f r5,r5,mlo ; rest msw
	add.pl.f DBL0L,DBL0L,1
	j_s.d [blink]
	add.eq DBL0H,DBL0H,1

.Linf_nan_dbl0:
	tst_s DBL1H,DBL1H
	j_s.d [blink]
	bxor.mi DBL0H,DBL0H,31
	.balign 4
.Linf_denorm:
	lsr r12,r6,28
	brlo.d r12,0xc,.Linf
.Ldenorm:
	asr r6,r6,20
	neg r9,r6
	mov_s DBL0H,0
	brhs.d r9,54,.Lret0
	bxor.mi DBL0H,DBL0H,31
	add r12,mhi,1
	and r12,r12,-4
	rsub r7,r6,5
	asr r10,r12,28
	bmsk r4,r12,27
	min r7,r7,31
	asr DBL0L,r4,r7
	add DBL1H,r11,r10
	abs.f r10,r4
	sub.mi r10,r10,1
	add.f r7,r6,32-5
	asl r4,r4,r7
	mov.mi r4,r10
	add.f r10,r6,23
	rsub r7,r6,9
	lsr r7,DBL1H,r7
	asl r10,DBL1H,r10
	or.pnz DBL0H,DBL0H,r7
	or.mi r4,r4,r10
	mov.mi r10,r7
	add.f DBL0L,r10,DBL0L
	add.cs.f DBL0H,DBL0H,1 ; carry clear after this point
	bxor.f 0,r4,31
	add.pnz.f DBL0L,DBL0L,1
	add.cs.f DBL0H,DBL0H,1
	jne_s [blink]
	/* Calculation so far was not conclusive; calculate further rest.  */
	mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo
	asr.f r12,r12,3
	asl r5,r5,25 ; s-51.7:25
	mov r11,mlo ; rest before considering r12 in r5 : -r11
	mulu64 (r12,r8) ; u-51.31:1
	and r9,DBL0L,1 ; tie-breaker: round to even
	lsr r11,r11,7 ; u-51.30:2
	mov DBL1H,mlo ; u-51.31:1
	mulu64 (r12,DBL1L) ; u-51.62:2
	sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L
	add_s DBL1H,DBL1H,r11
	sub DBL1H,DBL1H,r5 ; -rest msw
	add_s DBL1H,DBL1H,mhi ; -rest msw
	add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-(
	tst_s DBL1H,DBL1H
	cmp.eq mlo,r9
	add.cs.f DBL0L,DBL0L,1
	j_s.d [blink]
	add.cs DBL0H,DBL0H,1

.Lret0:
	/* return +- 0 */
	j_s.d [blink]
	mov_s DBL0L,0
.Linf:
	mov_s DBL0H,r9
	mov_s DBL0L,0
	j_s.d [blink]
	bxor.mi DBL0H,DBL0H,31
	ENDFUNC(__divdf3)
Commit	Line	Data
83ffe9cd	1	/* Copyright (C) 2008-2023 Free Software Foundation, Inc.
d38a64b4 JR	2	Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
	3	on behalf of Synopsys Inc.
	4
	5	This file is part of GCC.
	6
	7	GCC is free software; you can redistribute it and/or modify it under
	8	the terms of the GNU General Public License as published by the Free
	9	Software Foundation; either version 3, or (at your option) any later
	10	version.
	11
	12	GCC is distributed in the hope that it will be useful, but WITHOUT ANY
	13	WARRANTY; without even the implied warranty of MERCHANTABILITY or
	14	FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	15	for more details.
	16
	17	Under Section 7 of GPL version 3, you are granted additional
	18	permissions described in the GCC Runtime Library Exception, version
	19	3.1, as published by the Free Software Foundation.
	20
	21	You should have received a copy of the GNU General Public License and
	22	a copy of the GCC Runtime Library Exception along with this program;
	23	see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
	24	<http://www.gnu.org/licenses/>. */
	25
	26	/*
	27	to calculate a := b/x as b*y, with y := 1/x:
	28	- x is in the range [1..2)
	29	- calculate 15..18 bit inverse y0 using a table of approximating polynoms.
	30	Precision is higher for polynoms used to evaluate input with larger
	31	value.
	32	- Do one newton-raphson iteration step to double the precision,
	33	then multiply this with the divisor
	34	-> more time to decide if dividend is subnormal
	35	- the worst error propagation is on the side of the value range
	36	with the least initial defect, thus giving us about 30 bits precision.
	37	The truncation error for the either is less than 1 + x/2 ulp.
	38	A 31 bit inverse can be simply calculated by using x with implicit 1
	39	and chaining the multiplies. For a 32 bit inverse, we multiply y0^2
	40	with the bare fraction part of x, then add in y0^2 for the implicit
	41	1 of x.
	42	- If calculating a 31 bit inverse, the systematic error is less than
	43	-1 ulp; likewise, for 32 bit, it is less than -2 ulp.
	44	- If we calculate our seed with a 32 bit fraction, we can archive a
	45	tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we
	46	only need to take the step to calculate the 2nd stage rest and
	47	rounding adjust 1/32th of the time. However, if we use a 20 bit
	48	fraction for the seed, the negative error can exceed -2 ulp/128, (2)
	49	thus for a simple add / tst check, we need to do the 2nd stage
	50	rest calculation/ rounding adjust 1/16th of the time.
	51	(1): The inexactness of the 32 bit inverse contributes an error in the
	52	range of (-1 .. +(1+x/2) ) ulp/128. Leaving out the low word of the
	53	rest contributes an error < +1/x ulp/128 . In the interval [1,2),
	54	x/2 + 1/x <= 1.5 .
	55	(2): Unless proven otherwise. I have not actually looked for an
	56	example where -2 ulp/128 is exceeded, and my calculations indicate
	57	that the excess, if existent, is less than -1/512 ulp.
	58	??? The algorithm is still based on the ARC700 optimized code.
	59	Maybe we could make better use of 32x16 bit multiply, or 64 bit multiply
	60	results.
	61	*/
	62	#include "../arc-ieee-754.h"
	63	#define mlo acc2
	64	#define mhi acc1
	65	#define mul64(b,c) mullw 0,b,c` machlw 0,b,c
66	#define mulu64(b,c) mululw 0,b,c` machulw 0,b,c
67
68	/* N.B. fp-bit.c does double rounding on denormal numbers. */
69	#if 0 /* DEBUG */
70	.global __divdf3
71	FUNC(__divdf3)
72	.balign 4
73	__divdf3:
74	push_s blink
75	push_s r2
76	push_s r3
77	push_s r0
78	bl.d __divdf3_c
79	push_s r1
80	ld_s r2,[sp,12]
81	ld_s r3,[sp,8]
82	st_s r0,[sp,12]
83	st_s r1,[sp,8]
84	pop_s r1
85	bl.d __divdf3_asm
86	pop_s r0
87	pop_s r3
88	pop_s r2
89	pop_s blink
90	cmp r0,r2
91	cmp.eq r1,r3
92	jeq_s [blink]
93	and r12,DBL0H,DBL1H
94	bic.f 0,0x7ff80000,r12 ; both NaN -> OK
95	jeq_s [blink]
96	bl abort
97	ENDFUNC(__divdf3)
98	#define __divdf3 __divdf3_asm
99	#endif /* DEBUG */
100
101	FUNC(__divdf3)
102	.balign 4
103	.L7ff00000:
104	.long 0x7ff00000
105	.Ldivtab:
106	.long 0xfc0fffe1
107	.long 0xf46ffdfb
108	.long 0xed1ffa54
109	.long 0xe61ff515
110	.long 0xdf7fee75
111	.long 0xd91fe680
112	.long 0xd2ffdd52
113	.long 0xcd1fd30c
114	.long 0xc77fc7cd
115	.long 0xc21fbbb6
116	.long 0xbcefaec0
117	.long 0xb7efa100
118	.long 0xb32f92bf
119	.long 0xae8f83b7
120	.long 0xaa2f7467
121	.long 0xa5ef6479
122	.long 0xa1cf53fa
123	.long 0x9ddf433e
124	.long 0x9a0f3216
125	.long 0x965f2091
126	.long 0x92df0f11
127	.long 0x8f6efd05
128	.long 0x8c1eeacc
129	.long 0x88eed876
130	.long 0x85dec615
131	.long 0x82eeb3b9
132	.long 0x800ea10b
133	.long 0x7d3e8e0f
134	.long 0x7a8e7b3f
135	.long 0x77ee6836
136	.long 0x756e5576
137	.long 0x72fe4293
138	.long 0x709e2f93
139	.long 0x6e4e1c7f
140	.long 0x6c0e095e
141	.long 0x69edf6c5
142	.long 0x67cde3a5
143	.long 0x65cdd125
144	.long 0x63cdbe25
145	.long 0x61ddab3f
146	.long 0x600d991f
147	.long 0x5e3d868c
148	.long 0x5c6d7384
149	.long 0x5abd615f
150	.long 0x590d4ecd
151	.long 0x576d3c83
152	.long 0x55dd2a89
153	.long 0x545d18e9
154	.long 0x52dd06e9
155	.long 0x516cf54e
156	.long 0x4ffce356
157	.long 0x4e9cd1ce
158	.long 0x4d3cbfec
159	.long 0x4becae86
160	.long 0x4aac9da4
161	.long 0x496c8c73
162	.long 0x483c7bd3
163	.long 0x470c6ae8
164	.long 0x45dc59af
165	.long 0x44bc4915
166	.long 0x43ac3924
167	.long 0x428c27fb
168	.long 0x418c187a
169	.long 0x407c07bd
170
171	__divdf3_support: /* This label makes debugger output saner. */
172	.balign 4
173	.Ldenorm_dbl1:
174	brge r6, \
175	0x43500000,.Linf_NaN ; large number / denorm -> Inf
176	bmsk.f r12,DBL1H,19
177	mov.eq r12,DBL1L
178	mov.eq DBL1L,0
179	sub.eq r7,r7,32
180	norm.f r11,r12 ; flag for x/0 -> Inf check
181	beq_s .Linf_NaN
182	mov.mi r11,0
183	add.pl r11,r11,1
184	add_s r12,r12,r12
185	asl r8,r12,r11
186	rsub r12,r11,31
187	lsr r12,DBL1L,r12
188	tst_s DBL1H,DBL1H
189	or r8,r8,r12
190	lsr r4,r8,26
191	lsr DBL1H,r8,12
192	ld.as r4,[r10,r4]
193	bxor.mi DBL1H,DBL1H,31
194	sub r11,r11,11
195	asl DBL1L,DBL1L,r11
196	sub r11,r11,1
197	mulu64 (r4,r8)
198	sub r7,r7,r11
199	b.d .Lpast_denorm_dbl1
200	asl r7,r7,20
201
202	.Linf_NaN:
203	tst_s DBL0L,DBL0L ; 0/0 -> NaN
204	xor_s DBL1H,DBL1H,DBL0H
205	bclr.eq.f DBL0H,DBL0H,31
206	bmsk DBL0H,DBL1H,30
207	xor_s DBL0H,DBL0H,DBL1H
208	sub.eq DBL0H,DBL0H,1
209	mov_s DBL0L,0
210	j_s.d [blink]
211	or DBL0H,DBL0H,r9
212	.balign 4
213	.Lret0_2:
214	xor_s DBL1H,DBL1H,DBL0H
215	mov_s DBL0L,0
216	bmsk DBL0H,DBL1H,30
217	j_s.d [blink]
218	xor_s DBL0H,DBL0H,DBL1H
219	.balign 4
220	.global __divdf3
221	/* N.B. the spacing between divtab and the sub3 to get its address must
222	be a multiple of 8. */
223	__divdf3:
224	asl r8,DBL1H,12
225	lsr r4,r8,26
226	sub3 r10,pcl,51;(.-.Ldivtab) >> 3
227	ld.as r9,[pcl,-104]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
228	ld.as r4,[r10,r4]
229	lsr r12,DBL1L,20
230	and.f r7,DBL1H,r9
231	or r8,r8,r12
232	mulu64 (r4,r8)
233	beq.d .Ldenorm_dbl1
234	.Lpast_denorm_dbl1:
235	and.f r6,DBL0H,r9
236	breq.d r7,r9,.Linf_nan_dbl1
237	asl r4,r4,12
238	sub r4,r4,mhi
239	mululw 0,r4,r4
240	machulw r5,r4,r4
241	bne.d .Lnormal_dbl0
242	lsr r8,r8,1
243
244	.balign 4
245	.Ldenorm_dbl0:
246	bmsk.f r12,DBL0H,19
247	; wb stall
248	mov.eq r12,DBL0L
249	sub.eq r6,r6,32
250	norm.f r11,r12 ; flag for 0/x -> 0 check
251	brge r7, \
252	0x43500000, .Lret0_2 ; denorm/large number -> 0
253	beq_s .Lret0_2
254	mov.mi r11,0
255	add.pl r11,r11,1
256	asl r12,r12,r11
257	sub r6,r6,r11
258	add.f 0,r6,31
259	lsr r10,DBL0L,r6
260	mov.mi r10,0
261	add r6,r6,11+32
262	neg.f r11,r6
263	asl DBL0L,DBL0L,r11
264	mov.pl DBL0L,0
265	sub r6,r6,32-1
266	b.d .Lpast_denorm_dbl0
267	asl r6,r6,20
268
269	.balign 4
270	.Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN
271	or.f 0,r6,DBL0L
272	cmp.ne r6,r9
273	not_s DBL0L,DBL1H
274	sub_s.ne DBL0L,DBL0L,DBL0L
275	tst_s DBL0H,DBL0H
276	add_s DBL0H,DBL1H,DBL0L
277	j_s.d [blink]
278	bxor.mi DBL0H,DBL0H,31
279
280	.balign 4
281	.Lnormal_dbl0:
282	breq.d r6,r9,.Linf_nan_dbl0
283	asl r12,DBL0H,11
284	lsr r10,DBL0L,21
285	.Lpast_denorm_dbl0:
286	bset r8,r8,31
287	mulu64 (r5,r8)
288	add_s r12,r12,r10
289	bset r5,r12,31
290	cmp r5,r8
291	cmp.eq DBL0L,DBL1L
292	lsr.cc r5,r5,1
293	sub r4,r4,mhi ; u1.31 inverse, about 30 bit
294	mululw 0,r5,r4
295	machulw r11,r5,r4 ; result fraction highpart
296	lsr r8,r8,2 ; u3.29
297	add r5,r6, /* wait for immediate */ \
298	0x3fe00000
299	mulu64 (r11,r8) ; u-28.31
300	asl_s DBL1L,DBL1L,9 ; u-29.23:9
301	sbc r6,r5,r7
302	mov r12,mlo ; u-28.31
303	mulu64 (r11,DBL1L) ; mhi: u-28.23:9
304	add.cs DBL0L,DBL0L,DBL0L
305	asl_s DBL0L,DBL0L,6 ; u-26.25:7
306	asl r10,r11,23
307	sub_l DBL0L,DBL0L,r12
308	lsr r7,r11,9
309	sub r5,DBL0L,mhi ; rest msw ; u-26.31:0
310	mul64 (r5,r4) ; mhi: result fraction lowpart
311	xor.f 0,DBL0H,DBL1H
312	and DBL0H,r6,r9
313	add_s DBL0H,DBL0H,r7
314	bclr r12,r9,20 ; 0x7fe00000
315	brhs.d r6,r12,.Linf_denorm
316	bxor.mi DBL0H,DBL0H,31
317	add.f r12,mhi,0x11
318	asr r9,r12,5
319	sub.mi DBL0H,DBL0H,1
320	add.f DBL0L,r9,r10
321	tst r12,0x1c
322	jne.d [blink]
323	add.cs DBL0H,DBL0H,1
324	/* work out exact rounding if we fall through here. */
325	/* We know that the exact result cannot be represented in double
326	precision. Find the mid-point between the two nearest
327	representable values, multiply with the divisor, and check if
328	the result is larger than the dividend. Since we want to know
329	only the sign bit, it is sufficient to calculate only the
330	highpart of the lower 64 bits. */
331	mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo
332	sub.f DBL0L,DBL0L,1
333	asl r12,r9,2 ; u-22.30:2
334	sub.cs DBL0H,DBL0H,1
335	sub.f r12,r12,2
336	mov r10,mlo ; rest before considering r12 in r5 : -r10
337	mululw 0,r12,DBL1L
338	machulw r7,r12,DBL1L ; mhi: u-51.32
339	asl r5,r5,25 ; s-51.7:25
340	lsr r10,r10,7 ; u-51.30:2
341	mulu64 (r12,r8) ; mlo: u-51.31:1
342	sub r5,r5,r10
343	add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L
344	bset r7,r7,0 ; make sure that the result is not zero, and that
345	sub r5,r5,r7 ; a highpart zero appears negative
346	sub.f r5,r5,mlo ; rest msw
347	add.pl.f DBL0L,DBL0L,1
348	j_s.d [blink]
349	add.eq DBL0H,DBL0H,1
350
351	.Linf_nan_dbl0:
352	tst_s DBL1H,DBL1H
353	j_s.d [blink]
354	bxor.mi DBL0H,DBL0H,31
355	.balign 4
356	.Linf_denorm:
357	lsr r12,r6,28
358	brlo.d r12,0xc,.Linf
359	.Ldenorm:
360	asr r6,r6,20
361	neg r9,r6
362	mov_s DBL0H,0
363	brhs.d r9,54,.Lret0
364	bxor.mi DBL0H,DBL0H,31
365	add r12,mhi,1
366	and r12,r12,-4
367	rsub r7,r6,5
368	asr r10,r12,28
369	bmsk r4,r12,27
370	min r7,r7,31
371	asr DBL0L,r4,r7
372	add DBL1H,r11,r10
373	abs.f r10,r4
374	sub.mi r10,r10,1
375	add.f r7,r6,32-5
376	asl r4,r4,r7
377	mov.mi r4,r10
378	add.f r10,r6,23
379	rsub r7,r6,9
380	lsr r7,DBL1H,r7
381	asl r10,DBL1H,r10
382	or.pnz DBL0H,DBL0H,r7
383	or.mi r4,r4,r10
384	mov.mi r10,r7
385	add.f DBL0L,r10,DBL0L
386	add.cs.f DBL0H,DBL0H,1 ; carry clear after this point
387	bxor.f 0,r4,31
388	add.pnz.f DBL0L,DBL0L,1
389	add.cs.f DBL0H,DBL0H,1
390	jne_s [blink]
391	/* Calculation so far was not conclusive; calculate further rest. */
392	mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo
393	asr.f r12,r12,3
394	asl r5,r5,25 ; s-51.7:25
395	mov r11,mlo ; rest before considering r12 in r5 : -r11
396	mulu64 (r12,r8) ; u-51.31:1
397	and r9,DBL0L,1 ; tie-breaker: round to even
398	lsr r11,r11,7 ; u-51.30:2
399	mov DBL1H,mlo ; u-51.31:1
400	mulu64 (r12,DBL1L) ; u-51.62:2
401	sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L
402	add_s DBL1H,DBL1H,r11
403	sub DBL1H,DBL1H,r5 ; -rest msw
404	add_s DBL1H,DBL1H,mhi ; -rest msw
405	add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-(
406	tst_s DBL1H,DBL1H
407	cmp.eq mlo,r9
408	add.cs.f DBL0L,DBL0L,1
409	j_s.d [blink]
410	add.cs DBL0H,DBL0H,1
411
412	.Lret0:
413	/* return +- 0 */
414	j_s.d [blink]
415	mov_s DBL0L,0
416	.Linf:
417	mov_s DBL0H,r9
418	mov_s DBL0L,0
419	j_s.d [blink]
420	bxor.mi DBL0H,DBL0H,31
421	ENDFUNC(__divdf3)