[thirdparty/gcc.git] / libgcc / config / arc / ieee-754 / divsf3-stdmul.S

/* Copyright (C) 2008-2021 Free Software Foundation, Inc.
   Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
		on behalf of Synopsys Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/*
   - Calculate a 15..18 bit inverse using a table of approximating
     polynomials.  Precision is higher for the polynomials used to
     evaluate inputs with larger value.
   - Do one Newton-Raphson iteration step to double the precision,
     then multiply this with the divisor
	-> more time to decide if the dividend is subnormal.
     - The worst error propagation is on the side of the value range
       with the least initial defect, thus giving us about 30 bits of
       precision.
 */
#include "arc-ieee-754.h"

#if 0 /* DEBUG */
	.global __divsf3
	FUNC(__divsf3)
	.balign 4
__divsf3:
	push_s blink
	push_s r1
	bl.d __divsf3_c
	push_s r0
	ld_s r1,[sp,4]
	st_s r0,[sp,4]
	bl.d __divsf3_asm
	pop_s r0
	pop_s r1
	pop_s blink
	cmp r0,r1
#if 1
	bne abort
	jeq_s [blink]
	b abort
#else
	bne abort
	j_s [blink]
#endif
	ENDFUNC(__divsf3)
#define __divsf3 __divsf3_asm
#endif /* DEBUG */

	FUNC(__divsf3)
	.balign 4
.L7f800000:
	.long 0x7f800000
.Ldivtab:
	.long 0xfc0ffff0
	.long 0xf46ffefd
	.long 0xed1ffd2a
	.long 0xe627fa8e
	.long 0xdf7ff73b
	.long 0xd917f33b
	.long 0xd2f7eea3
	.long 0xcd1fe986
	.long 0xc77fe3e7
	.long 0xc21fdddb
	.long 0xbcefd760
	.long 0xb7f7d08c
	.long 0xb32fc960
	.long 0xae97c1ea
	.long 0xaa27ba26
	.long 0xa5e7b22e
	.long 0xa1cfa9fe
	.long 0x9ddfa1a0
	.long 0x9a0f990c
	.long 0x9667905d
	.long 0x92df878a
	.long 0x8f6f7e84
	.long 0x8c27757e
	.long 0x88f76c54
	.long 0x85df630c
	.long 0x82e759c5
	.long 0x8007506d
	.long 0x7d3f470a
	.long 0x7a8f3da2
	.long 0x77ef341e
	.long 0x756f2abe
	.long 0x72f7212d
	.long 0x709717ad
	.long 0x6e4f0e44
	.long 0x6c1704d6
	.long 0x69e6fb44
	.long 0x67cef1d7
	.long 0x65c6e872
	.long 0x63cedf18
	.long 0x61e6d5cd
	.long 0x6006cc6d
	.long 0x5e36c323
	.long 0x5c76b9f3
	.long 0x5abeb0b7
	.long 0x5916a79b
	.long 0x57769e77
	.long 0x55de954d
	.long 0x54568c4e
	.long 0x52d6834d
	.long 0x51667a7f
	.long 0x4ffe71b5
	.long 0x4e9e68f1
	.long 0x4d466035
	.long 0x4bf65784
	.long 0x4aae4ede
	.long 0x496e4646
	.long 0x48363dbd
	.long 0x47063547
	.long 0x45de2ce5
	.long 0x44be2498
	.long 0x43a61c64
	.long 0x4296144a
	.long 0x41860c0e
	.long 0x407e03ee
__divsf3_support: /* This label makes debugger output saner.  */
.Ldenorm_fp1:
	bclr r6,r6,31
	norm.f r12,r6 ; flag for x/0 -> Inf check
	add r6,r6,r6
	rsub r5,r12,16
	ror r5,r1,r5
	asl r6,r6,r12
	bmsk r5,r5,5
	ld.as r5,[r3,r5]
	add r4,r6,r6
	; load latency
	MPYHU r7,r5,r4
	bic.ne.f 0, \
		0x60000000,r0 ; large number / denorm -> Inf
	beq_s .Linf_NaN
	asl r5,r5,13
	; wb stall
	; slow track
	sub r7,r5,r7
	MPYHU r8,r7,r6
	asl_s r12,r12,23
	and.f r2,r0,r9
	add r2,r2,r12
	asl r12,r0,8
	; wb stall
	bne.d .Lpast_denorm_fp1
.Ldenorm_fp0:
	MPYHU r8,r8,r7
	bclr r12,r12,31
	norm.f r3,r12 ; flag for 0/x -> 0 check
	bic.ne.f 0,0x60000000,r1 ; denorm/large number -> 0
	beq_s .Lret0
	asl_s r12,r12,r3
	asl_s r3,r3,23
	add_s r12,r12,r12
	add r11,r11,r3
	b.d .Lpast_denorm_fp0
	mov_s r3,r12
	.balign 4
.Linf_NaN:
	bclr.f 0,r0,31 ; 0/0 -> NaN
	xor_s r0,r0,r1
	bmsk r1,r0,30
	bic_s r0,r0,r1
	sub.eq r0,r0,1
	j_s.d [blink]
	or r0,r0,r9
.Lret0:
	xor_s r0,r0,r1
	bmsk r1,r0,30
	j_s.d [blink]
	bic_s r0,r0,r1
.Linf_nan_fp1:
	lsr_s r0,r0,31
	bmsk.f 0,r1,22
	asl_s r0,r0,31
	bne_s 0f ; inf/inf -> nan
	brne r2,r9,.Lsigned0 ; x/inf -> 0, but x/nan -> nan
0:	j_s.d [blink]
	mov r0,-1
.Lsigned0:
.Linf_nan_fp0:
	tst_s r1,r1
	j_s.d [blink]
	bxor.mi r0,r0,31
	.balign 4
	.global __divsf3
/* N.B. the spacing between divtab and the sub3 to get its address must
   be a multiple of 8.  */
__divsf3:
	lsr r2,r1,17
	sub3 r3,pcl,55;(.-.Ldivtab) >> 3
	bmsk_s r2,r2,5
	ld.as r5,[r3,r2]
	asl r4,r1,9
	ld.as r9,[pcl,-114]; [pcl,(-((.-.L7f800000) >> 2))] ; 0x7f800000
	MPYHU r7,r5,r4
	asl r6,r1,8
	and.f r11,r1,r9
	bset r6,r6,31
	asl r5,r5,13
	; wb stall
	beq .Ldenorm_fp1
	sub r7,r5,r7
	MPYHU r8,r7,r6
	breq.d r11,r9,.Linf_nan_fp1
	and.f r2,r0,r9
	beq.d .Ldenorm_fp0
	asl r12,r0,8
	; wb stall
	breq r2,r9,.Linf_nan_fp0
	MPYHU r8,r8,r7
.Lpast_denorm_fp1:
	bset r3,r12,31
.Lpast_denorm_fp0:
	cmp_s r3,r6
	lsr.cc r3,r3,1
	add_s r2,r2, /* wait for immediate */ \
	/* wb stall */ \
		0x3f000000
	sub r7,r7,r8 ; u1.31 inverse, about 30 bit
	MPYHU r3,r3,r7
	sbc r2,r2,r11
	xor.f 0,r0,r1
	and r0,r2,r9
	bxor.mi r0,r0,31
	brhs r2, /* wb stall / wait for immediate */ \
		0x7f000000,.Linf_denorm
.Lpast_denorm:
	add_s r3,r3,0x22 ; round to nearest or higher
	tst r3,0x3c ; check if rounding was unsafe
	lsr r3,r3,6
	jne.d [blink] ; return if rounding was safe.
	add_s r0,r0,r3
        /* work out exact rounding if we fall through here.  */
        /* We know that the exact result cannot be represented in single
           precision.  Find the mid-point between the two nearest
           representable values, multiply with the divisor, and check if
           the result is larger than the dividend.  */
        add_s r3,r3,r3
        sub_s r3,r3,1
        mpyu r3,r3,r6
	asr.f 0,r0,1 ; for round-to-even in case this is a denorm
	rsub r2,r9,25
        asl_s r12,r12,r2
	; wb stall
	; slow track
        sub.f 0,r12,r3
        j_s.d [blink]
        sub.mi r0,r0,1
/* For denormal results, it is possible that an exact result needs
   rounding, and thus the round-to-even rule has to come into play.  */
.Linf_denorm:
	brlo r2,0xc0000000,.Linf
.Ldenorm:
	asr_s r2,r2,23
	bic r0,r0,r9
	neg r9,r2
	brlo.d r9,25,.Lpast_denorm
	lsr r3,r3,r9
	/* Fall through: return +- 0 */
	j_s [blink]
.Linf:
	j_s.d [blink]
	or r0,r0,r9
	ENDFUNC(__divsf3)
Commit	Line	Data
99dee823	1	/* Copyright (C) 2008-2021 Free Software Foundation, Inc.
d38a64b4 JR	2	Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
	3	on behalf of Synopsys Inc.
	4
	5	This file is part of GCC.
	6
	7	GCC is free software; you can redistribute it and/or modify it under
	8	the terms of the GNU General Public License as published by the Free
	9	Software Foundation; either version 3, or (at your option) any later
	10	version.
	11
	12	GCC is distributed in the hope that it will be useful, but WITHOUT ANY
	13	WARRANTY; without even the implied warranty of MERCHANTABILITY or
	14	FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	15	for more details.
	16
	17	Under Section 7 of GPL version 3, you are granted additional
	18	permissions described in the GCC Runtime Library Exception, version
	19	3.1, as published by the Free Software Foundation.
	20
	21	You should have received a copy of the GNU General Public License and
	22	a copy of the GCC Runtime Library Exception along with this program;
	23	see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
	24	<http://www.gnu.org/licenses/>. */
	25
	26	/*
	27	- calculate 15..18 bit inverse using a table of approximating polynoms.
	28	precision is higher for polynoms used to evaluate input with larger
	29	value.
	30	- do one newton-raphson iteration step to double the precision,
	31	then multiply this with the divisor
	32	-> more time to decide if dividend is subnormal
	33	- the worst error propagation is on the side of the value range
	34	with the least initial defect, thus giving us about 30 bits precision.
	35	*/
	36	#include "arc-ieee-754.h"
	37
	38	#if 0 /* DEBUG */
	39	.global __divsf3
	40	FUNC(__divsf3)
	41	.balign 4
	42	__divsf3:
	43	push_s blink
	44	push_s r1
	45	bl.d __divsf3_c
	46	push_s r0
	47	ld_s r1,[sp,4]
	48	st_s r0,[sp,4]
	49	bl.d __divsf3_asm
	50	pop_s r0
	51	pop_s r1
	52	pop_s blink
	53	cmp r0,r1
	54	#if 1
	55	bne abort
	56	jeq_s [blink]
	57	b abort
	58	#else
	59	bne abort
	60	j_s [blink]
	61	#endif
	62	ENDFUNC(__divsf3)
	63	#define __divsf3 __divsf3_asm
	64	#endif /* DEBUG */
	65
66	FUNC(__divsf3)
67	.balign 4
68	.L7f800000:
69	.long 0x7f800000
70	.Ldivtab:
71	.long 0xfc0ffff0
72	.long 0xf46ffefd
73	.long 0xed1ffd2a
74	.long 0xe627fa8e
75	.long 0xdf7ff73b
76	.long 0xd917f33b
77	.long 0xd2f7eea3
78	.long 0xcd1fe986
79	.long 0xc77fe3e7
80	.long 0xc21fdddb
81	.long 0xbcefd760
82	.long 0xb7f7d08c
83	.long 0xb32fc960
84	.long 0xae97c1ea
85	.long 0xaa27ba26
86	.long 0xa5e7b22e
87	.long 0xa1cfa9fe
88	.long 0x9ddfa1a0
89	.long 0x9a0f990c
90	.long 0x9667905d
91	.long 0x92df878a
92	.long 0x8f6f7e84
93	.long 0x8c27757e
94	.long 0x88f76c54
95	.long 0x85df630c
96	.long 0x82e759c5
97	.long 0x8007506d
98	.long 0x7d3f470a
99	.long 0x7a8f3da2
100	.long 0x77ef341e
101	.long 0x756f2abe
102	.long 0x72f7212d
103	.long 0x709717ad
104	.long 0x6e4f0e44
105	.long 0x6c1704d6
106	.long 0x69e6fb44
107	.long 0x67cef1d7
108	.long 0x65c6e872
109	.long 0x63cedf18
110	.long 0x61e6d5cd
111	.long 0x6006cc6d
112	.long 0x5e36c323
113	.long 0x5c76b9f3
114	.long 0x5abeb0b7
115	.long 0x5916a79b
116	.long 0x57769e77
117	.long 0x55de954d
118	.long 0x54568c4e
119	.long 0x52d6834d
120	.long 0x51667a7f
121	.long 0x4ffe71b5
122	.long 0x4e9e68f1
123	.long 0x4d466035
124	.long 0x4bf65784
125	.long 0x4aae4ede
126	.long 0x496e4646
127	.long 0x48363dbd
128	.long 0x47063547
129	.long 0x45de2ce5
130	.long 0x44be2498
131	.long 0x43a61c64
132	.long 0x4296144a
133	.long 0x41860c0e
134	.long 0x407e03ee
135	__divsf3_support: /* This label makes debugger output saner. */
136	.Ldenorm_fp1:
137	bclr r6,r6,31
138	norm.f r12,r6 ; flag for x/0 -> Inf check
139	add r6,r6,r6
140	rsub r5,r12,16
141	ror r5,r1,r5
142	asl r6,r6,r12
143	bmsk r5,r5,5
144	ld.as r5,[r3,r5]
145	add r4,r6,r6
146	; load latency
c0ab1970	147	MPYHU r7,r5,r4
d38a64b4 JR	148	bic.ne.f 0, \
	149	0x60000000,r0 ; large number / denorm -> Inf
	150	beq_s .Linf_NaN
	151	asl r5,r5,13
	152	; wb stall
	153	; slow track
	154	sub r7,r5,r7
c0ab1970	155	MPYHU r8,r7,r6
d38a64b4 JR	156	asl_s r12,r12,23
	157	and.f r2,r0,r9
	158	add r2,r2,r12
	159	asl r12,r0,8
	160	; wb stall
	161	bne.d .Lpast_denorm_fp1
	162	.Ldenorm_fp0:
c0ab1970	163	MPYHU r8,r8,r7
d38a64b4 JR	164	bclr r12,r12,31
	165	norm.f r3,r12 ; flag for 0/x -> 0 check
	166	bic.ne.f 0,0x60000000,r1 ; denorm/large number -> 0
	167	beq_s .Lret0
	168	asl_s r12,r12,r3
	169	asl_s r3,r3,23
	170	add_s r12,r12,r12
	171	add r11,r11,r3
	172	b.d .Lpast_denorm_fp0
	173	mov_s r3,r12
	174	.balign 4
	175	.Linf_NaN:
	176	bclr.f 0,r0,31 ; 0/0 -> NaN
	177	xor_s r0,r0,r1
	178	bmsk r1,r0,30
	179	bic_s r0,r0,r1
	180	sub.eq r0,r0,1
	181	j_s.d [blink]
	182	or r0,r0,r9
	183	.Lret0:
	184	xor_s r0,r0,r1
	185	bmsk r1,r0,30
	186	j_s.d [blink]
	187	bic_s r0,r0,r1
	188	.Linf_nan_fp1:
	189	lsr_s r0,r0,31
	190	bmsk.f 0,r1,22
	191	asl_s r0,r0,31
	192	bne_s 0f ; inf/inf -> nan
	193	brne r2,r9,.Lsigned0 ; x/inf -> 0, but x/nan -> nan
	194	0: j_s.d [blink]
	195	mov r0,-1
	196	.Lsigned0:
	197	.Linf_nan_fp0:
	198	tst_s r1,r1
	199	j_s.d [blink]
	200	bxor.mi r0,r0,31
	201	.balign 4
	202	.global __divsf3
	203	/* N.B. the spacing between divtab and the sub3 to get its address must
	204	be a multiple of 8. */
	205	__divsf3:
	206	lsr r2,r1,17
	207	sub3 r3,pcl,55;(.-.Ldivtab) >> 3
	208	bmsk_s r2,r2,5
	209	ld.as r5,[r3,r2]
	210	asl r4,r1,9
	211	ld.as r9,[pcl,-114]; [pcl,(-((.-.L7f800000) >> 2))] ; 0x7f800000
c0ab1970	212	MPYHU r7,r5,r4
d38a64b4 JR	213	asl r6,r1,8
	214	and.f r11,r1,r9
	215	bset r6,r6,31
	216	asl r5,r5,13
	217	; wb stall
	218	beq .Ldenorm_fp1
	219	sub r7,r5,r7
c0ab1970	220	MPYHU r8,r7,r6
d38a64b4 JR	221	breq.d r11,r9,.Linf_nan_fp1
	222	and.f r2,r0,r9
	223	beq.d .Ldenorm_fp0
	224	asl r12,r0,8
	225	; wb stall
	226	breq r2,r9,.Linf_nan_fp0
c0ab1970	227	MPYHU r8,r8,r7
d38a64b4 JR	228	.Lpast_denorm_fp1:
	229	bset r3,r12,31
	230	.Lpast_denorm_fp0:
	231	cmp_s r3,r6
	232	lsr.cc r3,r3,1
	233	add_s r2,r2, /* wait for immediate */ \
	234	/* wb stall */ \
	235	0x3f000000
	236	sub r7,r7,r8 ; u1.31 inverse, about 30 bit
c0ab1970	237	MPYHU r3,r3,r7
d38a64b4 JR	238	sbc r2,r2,r11
	239	xor.f 0,r0,r1
	240	and r0,r2,r9
	241	bxor.mi r0,r0,31
	242	brhs r2, /* wb stall / wait for immediate */ \
	243	0x7f000000,.Linf_denorm
	244	.Lpast_denorm:
	245	add_s r3,r3,0x22 ; round to nearest or higher
	246	tst r3,0x3c ; check if rounding was unsafe
	247	lsr r3,r3,6
	248	jne.d [blink] ; return if rounding was safe.
	249	add_s r0,r0,r3
	250	/* work out exact rounding if we fall through here. */
	251	/* We know that the exact result cannot be represented in single
	252	precision. Find the mid-point between the two nearest
	253	representable values, multiply with the divisor, and check if
	254	the result is larger than the dividend. */
	255	add_s r3,r3,r3
	256	sub_s r3,r3,1
	257	mpyu r3,r3,r6
	258	asr.f 0,r0,1 ; for round-to-even in case this is a denorm
	259	rsub r2,r9,25
	260	asl_s r12,r12,r2
	261	; wb stall
	262	; slow track
	263	sub.f 0,r12,r3
	264	j_s.d [blink]
	265	sub.mi r0,r0,1
	266	/* For denormal results, it is possible that an exact result needs
	267	rounding, and thus the round-to-even rule has to come into play. */
	268	.Linf_denorm:
	269	brlo r2,0xc0000000,.Linf
	270	.Ldenorm:
	271	asr_s r2,r2,23
	272	bic r0,r0,r9
	273	neg r9,r2
	274	brlo.d r9,25,.Lpast_denorm
	275	lsr r3,r3,r9
	276	/* Fall through: return +- 0 */
	277	j_s [blink]
	278	.Linf:
	279	j_s.d [blink]
	280	or r0,r0,r9
	281	ENDFUNC(__divsf3)