Commit | Line | Data |
---|---|---|
fbd26352 | 1 | /* Copyright (C) 2008-2019 Free Software Foundation, Inc. |
8eaaaea3 | 2 | Contributor: Joern Rennecke <joern.rennecke@embecosm.com> |
3 | on behalf of Synopsys Inc. | |
4 | ||
5 | This file is part of GCC. | |
6 | ||
7 | GCC is free software; you can redistribute it and/or modify it under | |
8 | the terms of the GNU General Public License as published by the Free | |
9 | Software Foundation; either version 3, or (at your option) any later | |
10 | version. | |
11 | ||
12 | GCC is distributed in the hope that it will be useful, but WITHOUT ANY | |
13 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
14 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
15 | for more details. | |
16 | ||
17 | Under Section 7 of GPL version 3, you are granted additional | |
18 | permissions described in the GCC Runtime Library Exception, version | |
19 | 3.1, as published by the Free Software Foundation. | |
20 | ||
21 | You should have received a copy of the GNU General Public License and | |
22 | a copy of the GCC Runtime Library Exception along with this program; | |
23 | see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
24 | <http://www.gnu.org/licenses/>. */ | |
25 | ||
26 | /* | |
27 | to calculate a := b/x as b*y, with y := 1/x: | |
28 | - x is in the range [1..2) | |
29 | - calculate 15..18 bit inverse y0 using a table of approximating polynomials. |
30 | Precision is higher for polynomials used to evaluate input with larger | |
31 | value. | |
32 | - Do one Newton-Raphson iteration step to double the precision, | |
33 | then multiply this with the divisor | |
34 | -> more time to decide if dividend is subnormal | |
35 | - the worst error propagation is on the side of the value range | |
36 | with the least initial defect, thus giving us about 30 bits precision. | |
37 | The truncation error for either is less than 1 + x/2 ulp. | |
38 | A 31 bit inverse can be simply calculated by using x with implicit 1 | |
39 | and chaining the multiplies. For a 32 bit inverse, we multiply y0^2 | |
40 | with the bare fraction part of x, then add in y0^2 for the implicit | |
41 | 1 of x. | |
42 | - If calculating a 31 bit inverse, the systematic error is less than | |
43 | -1 ulp; likewise, for 32 bit, it is less than -2 ulp. | |
44 | - If we calculate our seed with a 32 bit fraction, we can achieve a | |
45 | tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we | |
46 | only need to take the step to calculate the 2nd stage rest and | |
47 | rounding adjust 1/32th of the time. However, if we use a 20 bit | |
48 | fraction for the seed, the negative error can exceed -2 ulp/128, (2) | |
49 | thus for a simple add / tst check, we need to do the 2nd stage | |
50 | rest calculation/ rounding adjust 1/16th of the time. | |
51 | (1): The inexactness of the 32 bit inverse contributes an error in the | |
52 | range of (-1 .. +(1+x/2) ) ulp/128. Leaving out the low word of the | |
53 | rest contributes an error < +1/x ulp/128 . In the interval [1,2), | |
54 | x/2 + 1/x <= 1.5 . | |
55 | (2): Unless proven otherwise. I have not actually looked for an | |
56 | example where -2 ulp/128 is exceeded, and my calculations indicate | |
57 | that the excess, if existent, is less than -1/512 ulp. | |
58 | */ | |
59 | #include "arc-ieee-754.h" | |
60 | ||
61 | /* N.B. fp-bit.c does double rounding on denormal numbers. */ | |
62 | #if 0 /* DEBUG */ | |
63 | .global __divdf3 | |
64 | FUNC(__divdf3) | |
65 | .balign 4 | |
66 | __divdf3: | |
67 | push_s blink | |
68 | push_s r2 | |
69 | push_s r3 | |
70 | push_s r0 | |
71 | bl.d __divdf3_c | |
72 | push_s r1 | |
73 | ld_s r2,[sp,12] | |
74 | ld_s r3,[sp,8] | |
75 | st_s r0,[sp,12] | |
76 | st_s r1,[sp,8] | |
77 | pop_s r1 | |
78 | bl.d __divdf3_asm | |
79 | pop_s r0 | |
80 | pop_s r3 | |
81 | pop_s r2 | |
82 | pop_s blink | |
83 | cmp r0,r2 | |
84 | cmp.eq r1,r3 | |
85 | jeq_s [blink] | |
86 | and r12,DBL0H,DBL1H | |
87 | bic.f 0,0x7ff80000,r12 ; both NaN -> OK | |
88 | jeq_s [blink] | |
89 | bl abort | |
90 | ENDFUNC(__divdf3) | |
91 | #define __divdf3 __divdf3_asm | |
92 | #endif /* DEBUG */ | |
93 | ||
94 | FUNC(__divdf3) | |
95 | __divdf3_support: /* This label makes debugger output saner. */ | |
96 | .balign 4 | |
97 | .Ldenorm_dbl1: | |
98 | brge r6, \ | |
99 | 0x43500000,.Linf_NaN ; large number / denorm -> Inf | |
100 | bmsk.f r12,DBL1H,19 | |
101 | mov.eq r12,DBL1L | |
102 | mov.eq DBL1L,0 | |
103 | sub.eq r7,r7,32 | |
104 | norm.f r11,r12 ; flag for x/0 -> Inf check | |
105 | beq_s .Linf_NaN | |
106 | mov.mi r11,0 | |
107 | add.pl r11,r11,1 | |
108 | add_s r12,r12,r12 | |
109 | asl r8,r12,r11 | |
110 | rsub r12,r11,31 | |
111 | lsr r12,DBL1L,r12 | |
112 | tst_s DBL1H,DBL1H | |
113 | or r8,r8,r12 | |
114 | lsr r4,r8,26 | |
115 | lsr DBL1H,r8,12 | |
116 | ld.as r4,[r10,r4] | |
117 | bxor.mi DBL1H,DBL1H,31 | |
118 | sub r11,r11,11 | |
119 | asl DBL1L,DBL1L,r11 | |
120 | sub r11,r11,1 | |
0c4d7986 | 121 | MPYHU r5,r4,r8 |
8eaaaea3 | 122 | sub r7,r7,r11 |
123 | asl r4,r4,12 | |
124 | b.d .Lpast_denorm_dbl1 | |
125 | asl r7,r7,20 | |
126 | ; wb stall | |
127 | ||
128 | .balign 4 | |
129 | .Ldenorm_dbl0: | |
130 | bmsk.f r12,DBL0H,19 | |
131 | ; wb stall | |
132 | mov.eq r12,DBL0L | |
133 | sub.eq r6,r6,32 | |
134 | norm.f r11,r12 ; flag for 0/x -> 0 check | |
135 | brge r7, \ | |
136 | 0x43500000, .Lret0_NaN ; denorm/large number -> 0 | |
137 | beq_s .Lret0_NaN | |
138 | mov.mi r11,0 | |
139 | add.pl r11,r11,1 | |
140 | asl r12,r12,r11 | |
141 | sub r6,r6,r11 | |
142 | add.f 0,r6,31 | |
143 | lsr r10,DBL0L,r6 | |
144 | mov.mi r10,0 | |
145 | add r6,r6,11+32 | |
146 | neg.f r11,r6 | |
147 | asl DBL0L,DBL0L,r11 | |
148 | mov.pl DBL0L,0 | |
149 | sub r6,r6,32-1 | |
150 | b.d .Lpast_denorm_dbl0 | |
151 | asl r6,r6,20 | |
152 | ||
153 | .Linf_NaN: | |
154 | tst_s DBL0L,DBL0L ; 0/0 -> NaN | |
155 | xor_s DBL1H,DBL1H,DBL0H | |
156 | bclr.eq.f DBL0H,DBL0H,31 | |
157 | bmsk DBL0H,DBL1H,30 | |
158 | xor_s DBL0H,DBL0H,DBL1H | |
159 | sub.eq DBL0H,DBL0H,1 | |
160 | mov_s DBL0L,0 | |
161 | j_s.d [blink] | |
162 | or DBL0H,DBL0H,r9 | |
163 | .balign 4 | |
164 | .Lret0_NaN: | |
165 | xor_s DBL1H,DBL1H,DBL0H | |
166 | cmp_s r12,r9 | |
167 | mov_s DBL0L,0 | |
168 | bmsk DBL0H,DBL1H,30 | |
169 | xor_s DBL0H,DBL0H,DBL1H | |
170 | j_s.d [blink] | |
171 | sub.hi DBL0H,DBL0H,1 | |
172 | .Linf_nan_dbl1: ; Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN | |
173 | not_s DBL0L,DBL1H | |
174 | cmp r6,r9 | |
175 | sub_s.ne DBL0L,DBL0L,DBL0L | |
176 | tst_s DBL0H,DBL0H | |
177 | add_s DBL0H,DBL1H,DBL0L | |
178 | j_s.d [blink] | |
179 | bxor.mi DBL0H,DBL0H,31 | |
180 | .Linf_nan_dbl0: | |
181 | tst_s DBL1H,DBL1H | |
182 | j_s.d [blink] | |
183 | bxor.mi DBL0H,DBL0H,31 | |
184 | .balign 4 | |
185 | .global __divdf3 | |
186 | /* N.B. the spacing between divtab and the add3 to get its address must | |
187 | be a multiple of 8. */ | |
188 | __divdf3: | |
189 | asl r8,DBL1H,12 | |
190 | lsr r12,DBL1L,20 | |
191 | lsr r4,r8,26 | |
e69e67d4 | 192 | #if defined (__ARCHS__) || defined (__ARCEM__) |
0c4d7986 | 193 | add3 r10,pcl,60 ; (.Ldivtab-.) >> 3 |
194 | #else | |
8eaaaea3 | 195 | add3 r10,pcl,59 ; (.Ldivtab-.) >> 3 |
0c4d7986 | 196 | #endif |
8eaaaea3 | 197 | ld.as r4,[r10,r4] |
e69e67d4 | 198 | #if defined (__ARCHS__) || defined (__ARCEM__) |
0c4d7986 | 199 | ld.as r9,[pcl,182]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000 |
200 | #else | |
8eaaaea3 | 201 | ld.as r9,[pcl,180]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000 |
0c4d7986 | 202 | #endif |
8eaaaea3 | 203 | or r8,r8,r12 |
0c4d7986 | 204 | MPYHU r5,r4,r8 |
8eaaaea3 | 205 | and.f r7,DBL1H,r9 |
206 | asl r4,r4,12 ; having the asl here is a concession to the XMAC pipeline. | |
207 | beq.d .Ldenorm_dbl1 | |
208 | and r6,DBL0H,r9 | |
209 | .Lpast_denorm_dbl1: ; wb stall | |
210 | sub r4,r4,r5 | |
0c4d7986 | 211 | MPYHU r5,r4,r4 |
8eaaaea3 | 212 | breq.d r6,0,.Ldenorm_dbl0 |
213 | lsr r8,r8,1 | |
214 | asl r12,DBL0H,11 | |
215 | lsr r10,DBL0L,21 | |
216 | .Lpast_denorm_dbl0: ; wb stall | |
217 | bset r8,r8,31 | |
0c4d7986 | 218 | MPYHU r11,r5,r8 |
8eaaaea3 | 219 | add_s r12,r12,r10 |
220 | bset r5,r12,31 | |
221 | cmp r5,r8 | |
222 | cmp.eq DBL0L,DBL1L | |
223 | ; wb stall | |
224 | lsr.cc r5,r5,1 | |
225 | sub r4,r4,r11 ; u1.31 inverse, about 30 bit | |
0c4d7986 | 226 | MPYHU r11,r5,r4 ; result fraction highpart |
8eaaaea3 | 227 | breq r7,r9,.Linf_nan_dbl1 |
228 | lsr r8,r8,2 ; u3.29 | |
229 | add r5,r6, /* wait for immediate / XMAC wb stall */ \ | |
230 | 0x3fe00000 | |
231 | ; wb stall (not for XMAC) | |
232 | breq r6,r9,.Linf_nan_dbl0 | |
233 | mpyu r12,r11,r8 ; u-28.31 | |
234 | asl_s DBL1L,DBL1L,9 ; u-29.23:9 | |
235 | sbc r6,r5,r7 | |
236 | ; resource conflict (not for XMAC) | |
0c4d7986 | 237 | MPYHU r5,r11,DBL1L ; u-28.23:9 |
8eaaaea3 | 238 | add.cs DBL0L,DBL0L,DBL0L |
239 | asl_s DBL0L,DBL0L,6 ; u-26.25:7 | |
240 | asl r10,r11,23 | |
241 | sub_l DBL0L,DBL0L,r12 | |
242 | ; wb stall (before 'and' for XMAC) | |
243 | lsr r7,r11,9 | |
244 | sub r5,DBL0L,r5 ; rest msw ; u-26.31:0 | |
0c4d7986 | 245 | MPYH r12,r5,r4 ; result fraction lowpart |
8eaaaea3 | 246 | xor.f 0,DBL0H,DBL1H |
247 | and DBL0H,r6,r9 | |
248 | add_s DBL0H,DBL0H,r7 ; (XMAC wb stall) | |
249 | bxor.mi DBL0H,DBL0H,31 | |
250 | brhs r6, /* wb stall / wait for immediate */ \ | |
251 | 0x7fe00000,.Linf_denorm | |
252 | add.f r12,r12,0x11 | |
253 | asr r9,r12,5 | |
254 | sub.mi DBL0H,DBL0H,1 | |
255 | add.f DBL0L,r9,r10 | |
256 | tst r12,0x1c | |
257 | jne.d [blink] | |
258 | add.cs DBL0H,DBL0H,1 | |
259 | /* work out exact rounding if we fall through here. */ | |
260 | /* We know that the exact result cannot be represented in double | |
261 | precision. Find the mid-point between the two nearest | |
262 | representable values, multiply with the divisor, and check if | |
263 | the result is larger than the dividend. Since we want to know | |
264 | only the sign bit, it is sufficient to calculate only the | |
265 | highpart of the lower 64 bits. */ | |
266 | sub.f DBL0L,DBL0L,1 | |
267 | asl r12,r9,2 ; u-22.30:2 | |
268 | mpyu r10,r11,DBL1L ; rest before considering r12 in r5 : -r10 | |
269 | sub.cs DBL0H,DBL0H,1 | |
270 | sub.f r12,r12,2 | |
271 | ; resource conflict (not for XMAC) | |
0c4d7986 | 272 | MPYHU r7,r12,DBL1L ; u-51.32 |
8eaaaea3 | 273 | asl r5,r5,25 ; s-51.7:25 |
274 | lsr r10,r10,7 ; u-51.30:2 | |
275 | ; resource conflict (not for XMAC) | |
276 | ; resource conflict (not for XMAC) | |
277 | mpyu r9,r12,r8 ; u-51.31:1 | |
278 | sub r5,r5,r10 | |
279 | add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L | |
280 | bset r7,r7,0 ; make sure that the result is not zero, and that | |
281 | ; wb stall (one earlier for XMAC) | |
282 | sub r5,r5,r7 ; a highpart zero appears negative | |
283 | sub.f r5,r5,r9 ; rest msw | |
284 | add.pl.f DBL0L,DBL0L,1 | |
285 | j_s.d [blink] | |
286 | add.eq DBL0H,DBL0H,1 | |
287 | ||
288 | .balign 4 | |
289 | .Linf_denorm: | |
290 | brlo r6,0xc0000000,.Linf | |
291 | .Ldenorm: | |
292 | asr r6,r6,20 | |
293 | neg r9,r6 | |
294 | mov_s DBL0H,0 | |
295 | brhs.d r9,54,.Lret0 | |
296 | bxor.mi DBL0H,DBL0H,31 | |
297 | add_l r12,r12,1 | |
298 | and r12,r12,-4 | |
299 | rsub r7,r6,5 | |
300 | asr r10,r12,28 | |
301 | bmsk r4,r12,27 | |
e69e67d4 | 302 | #if defined (__ARCHS__) || defined (__ARCEM__) |
0c4d7986 | 303 | min r7, r7, 31 |
304 | asr DBL0L, r4, r7 | |
305 | #else | |
8eaaaea3 | 306 | asrs DBL0L,r4,r7 |
0c4d7986 | 307 | #endif |
8eaaaea3 | 308 | add DBL1H,r11,r10 |
e69e67d4 | 309 | #if defined (__ARCHS__) || defined (__ARCEM__) |
0c4d7986 | 310 | abs.f r10, r4 |
311 | sub.mi r10, r10, 1 | |
312 | #endif | |
8eaaaea3 | 313 | add.f r7,r6,32-5 |
0c4d7986 | 314 | #ifdef __ARC700__ |
8eaaaea3 | 315 | abss r10,r4 |
0c4d7986 | 316 | #endif |
8eaaaea3 | 317 | asl r4,r4,r7 |
318 | mov.mi r4,r10 | |
319 | add.f r10,r6,23 | |
320 | rsub r7,r6,9 | |
321 | lsr r7,DBL1H,r7 | |
322 | asl r10,DBL1H,r10 | |
323 | or.pnz DBL0H,DBL0H,r7 | |
324 | or.mi r4,r4,r10 | |
325 | mov.mi r10,r7 | |
326 | add.f DBL0L,r10,DBL0L | |
327 | add.cs.f DBL0H,DBL0H,1 ; carry clear after this point | |
328 | bxor.f 0,r4,31 | |
329 | add.pnz.f DBL0L,DBL0L,1 | |
330 | add.cs.f DBL0H,DBL0H,1 | |
331 | jne_l [blink] | |
332 | /* Calculation so far was not conclusive; calculate further rest. */ | |
333 | mpyu r11,r11,DBL1L ; rest before considering r12 in r5 : -r11 | |
334 | asr.f r12,r12,3 | |
335 | asl r5,r5,25 ; s-51.7:25 | |
336 | ; resource conflict (not for XMAC) | |
337 | mpyu DBL1H,r12,r8 ; u-51.31:1 | |
338 | and r9,DBL0L,1 ; tie-breaker: round to even | |
339 | lsr r11,r11,7 ; u-51.30:2 | |
340 | ; resource conflict (not for XMAC) | |
0c4d7986 | 341 | MPYHU r8,r12,DBL1L ; u-51.32 |
8eaaaea3 | 342 | sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L |
343 | add_s DBL1H,DBL1H,r11 | |
344 | ; resource conflict (not for XMAC) | |
345 | ; resource conflict (not for XMAC) | |
346 | mpyu r12,r12,DBL1L ; u-83.30:2 | |
347 | sub DBL1H,DBL1H,r5 ; -rest msw | |
348 | add_s DBL1H,DBL1H,r8 ; -rest msw | |
349 | add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-( | |
350 | ; wb stall (XMAC: Before add.f) | |
351 | tst_s DBL1H,DBL1H | |
352 | cmp.eq r12,r9 | |
353 | add.cs.f DBL0L,DBL0L,1 | |
354 | j_s.d [blink] | |
355 | add.cs DBL0H,DBL0H,1 | |
356 | ||
357 | .Lret0: | |
358 | /* return +- 0 */ | |
359 | j_s.d [blink] | |
360 | mov_s DBL0L,0 | |
361 | .Linf: | |
362 | mov_s DBL0H,r9 | |
363 | mov_s DBL0L,0 | |
364 | j_s.d [blink] | |
365 | bxor.mi DBL0H,DBL0H,31 | |
366 | ||
367 | .balign 4 | |
368 | .Ldivtab: | |
369 | .long 0xfc0fffe1 | |
370 | .long 0xf46ffdfb | |
371 | .long 0xed1ffa54 | |
372 | .long 0xe61ff515 | |
373 | .long 0xdf7fee75 | |
374 | .long 0xd91fe680 | |
375 | .long 0xd2ffdd52 | |
376 | .long 0xcd1fd30c | |
377 | .long 0xc77fc7cd | |
378 | .long 0xc21fbbb6 | |
379 | .long 0xbcefaec0 | |
380 | .long 0xb7efa100 | |
381 | .long 0xb32f92bf | |
382 | .long 0xae8f83b7 | |
383 | .long 0xaa2f7467 | |
384 | .long 0xa5ef6479 | |
385 | .long 0xa1cf53fa | |
386 | .long 0x9ddf433e | |
387 | .long 0x9a0f3216 | |
388 | .long 0x965f2091 | |
389 | .long 0x92df0f11 | |
390 | .long 0x8f6efd05 | |
391 | .long 0x8c1eeacc | |
392 | .long 0x88eed876 | |
393 | .long 0x85dec615 | |
394 | .long 0x82eeb3b9 | |
395 | .long 0x800ea10b | |
396 | .long 0x7d3e8e0f | |
397 | .long 0x7a8e7b3f | |
398 | .long 0x77ee6836 | |
399 | .long 0x756e5576 | |
400 | .long 0x72fe4293 | |
401 | .long 0x709e2f93 | |
402 | .long 0x6e4e1c7f | |
403 | .long 0x6c0e095e | |
404 | .long 0x69edf6c5 | |
405 | .long 0x67cde3a5 | |
406 | .long 0x65cdd125 | |
407 | .long 0x63cdbe25 | |
408 | .long 0x61ddab3f | |
409 | .long 0x600d991f | |
410 | .long 0x5e3d868c | |
411 | .long 0x5c6d7384 | |
412 | .long 0x5abd615f | |
413 | .long 0x590d4ecd | |
414 | .long 0x576d3c83 | |
415 | .long 0x55dd2a89 | |
416 | .long 0x545d18e9 | |
417 | .long 0x52dd06e9 | |
418 | .long 0x516cf54e | |
419 | .long 0x4ffce356 | |
420 | .long 0x4e9cd1ce | |
421 | .long 0x4d3cbfec | |
422 | .long 0x4becae86 | |
423 | .long 0x4aac9da4 | |
424 | .long 0x496c8c73 | |
425 | .long 0x483c7bd3 | |
426 | .long 0x470c6ae8 | |
427 | .long 0x45dc59af | |
428 | .long 0x44bc4915 | |
429 | .long 0x43ac3924 | |
430 | .long 0x428c27fb | |
431 | .long 0x418c187a | |
432 | .long 0x407c07bd | |
433 | .L7ff00000: | |
434 | .long 0x7ff00000 | |
435 | ENDFUNC(__divdf3) |