[thirdparty/gcc.git] / libgcc / config / epiphany / ieee-754 / fast_div.S

/* Copyright (C) 2011-2023 Free Software Foundation, Inc.
   Contributed by Embecosm on behalf of Adapteva, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#include "../epiphany-asm.h"

.section _fast_div_text,"a",@progbits;
  .balign 8;
_fast_div_table:
.word 0x007fffff//   mantissa mask
.word 0x40257ebb//   hold constant a = 2.58586

.word 0x3f000000//   hold constant 126 shifted to bits [30:23]
.word 0xc0ba2e88//   hold constant b = -5.81818

.word 0x4087c1e8//   hold constant c = 4.24242
.word 0x40000000//  to hold constant 2 for Newton-Raphson iterations

 .global SYM(__fast_recipsf2)
 FUNC(__fast_recipsf2)
SYM(__fast_recipsf2):

//###################
//# input operands:
//###################
// Divisor
//R0
// Function address (used with negative offsets to read _fast_div_table)
//R1
/* Scratch registers:  two single (TMP0/TMP5) and two pairs.  */
#define P0L TMP1
#define P0H TMP2
#define P1L TMP3
#define P1H TMP4

//#########################################
//# Constants to be used in the algorithm
//#########################################
ldrd P0L , [ R1 , -3 ]

ldrd P1L , [ R1 , -2 ]


//#############################################################################
//#                       The Algorithm
//#
//# Operation: C=A/B
//# stage 1 - find the reciprocal 1/B according to the following scheme:
//#  B = (2^E)*m                                (1<m<2, E=e-127)
//#  1/B = 1/((2^E)*m) = 1/((2^(E+1))*m1)          (0.5<m1<1)
//#      = (2^-(E+1))*(1/m1) = (2^E1)*(1/m1)
//#
//# Now we can find the new exponent:
//# e1 = E1+127 = -E-1+127 = -e+127-1+127 = 253-e **
//# 1/m1 alreadt has the exponent 127, so we have to add 126-e.
//# the exponent might underflow, which we can detect as a sign change.
//# Since the architeture uses flush-to-zero for subnormals, we can
//# give the result 0. then.
//#
//# The 1/m1 term with 0.5<m1<1 is approximated with the Chebyshev polynomial
//# 1/m1 = 2.58586*(m1^2) - 5.81818*m1 + 4.24242
//#
//# Next step is to use two iterations of Newton-Raphson algorithm to complete
//# the reciprocal calculation.
//#
//# Final result is achieved by multiplying A with 1/B
//#############################################################################


// R0 exponent and sign "replacement" into TMP0
AND TMP0,R0,P0L		 ;
ORR TMP0,TMP0,P1L
SUB TMP5,R0,TMP0 // R0 sign/exponent extraction into TMP5
// Calculate new mantissa
FMADD P1H,TMP0,P0H	         ;
		// Calculate new exponent offset 126 - "old exponent"
		SUB P1L,P1L,TMP5
	ldrd P0L , [ R1 , -1 ]
FMADD P0L,TMP0,P1H	         ;
		eor P1H,r0,P1L // check for overflow (N-BIT).
		blt .Lret_0
// P0L exponent and sign "replacement"
sub P0L,P0L,TMP5

// Newton-Raphson iteration #1
MOV TMP0,P0H	         ;
FMSUB P0H,R0,P0L	 ;
FMUL  P0L,P0H,P0L	 ;
// Newton-Raphson iteration #2
FMSUB TMP0,R0,P0L	;
FMUL  R0,TMP0,P0L	         ;
jr lr
.Lret_0:ldrd P0L , [ R1 , -3 ]
	lsr TMP0,r0,31 ; extract sign
	lsl TMP0,TMP0,31
	add P0L,P0L,r0 ; check for NaN input
	eor P0L,P0L,r0
	movgte r0,TMP0
	jr lr
// Quotient calculation is expected by the caller: FMUL quotient,divident,R0
        ;
	ENDFUNC(__fast_recipsf2)
Commit	Line	Data
83ffe9cd	1	/* Copyright (C) 2011-2023 Free Software Foundation, Inc.
feeeff5c JR	2	Contributed by Embecosm on behalf of Adapteva, Inc.
	3
	4	This file is part of GCC.
	5
	6	GCC is free software; you can redistribute it and/or modify it under
	7	the terms of the GNU General Public License as published by the Free
	8	Software Foundation; either version 3, or (at your option) any later
	9	version.
	10
	11	GCC is distributed in the hope that it will be useful, but WITHOUT ANY
	12	WARRANTY; without even the implied warranty of MERCHANTABILITY or
	13	FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	14	for more details.
	15
	16	Under Section 7 of GPL version 3, you are granted additional
	17	permissions described in the GCC Runtime Library Exception, version
	18	3.1, as published by the Free Software Foundation.
	19
	20	You should have received a copy of the GNU General Public License and
	21	a copy of the GCC Runtime Library Exception along with this program;
	22	see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
	23	<http://www.gnu.org/licenses/>. */
	24
	25	#include "../epiphany-asm.h"
	26
	27	.section _fast_div_text,"a",@progbits;
	28	.balign 8;
	29	_fast_div_table:
	30	.word 0x007fffff// mantissa mask
	31	.word 0x40257ebb// hold constant a = 2.58586
	32
	33	.word 0x3f000000// hold constant 126 shifted to bits [30:23]
	34	.word 0xc0ba2e88// hold constant b = -5.81818
	35
	36	.word 0x4087c1e8// hold constant c = 4.24242
	37	.word 0x40000000// to hold constant 2 for Newton-Raphson iterations
	38
	39	.global SYM(__fast_recipsf2)
	40	FUNC(__fast_recipsf2)
	41	SYM(__fast_recipsf2):
	42
	43	//###################
	44	//# input operands:
	45	//###################
	46	// Divisor
	47	//R0
	48	// Function address (used with negative offsets to read _fast_div_table)
	49	//R1
	50	/* Scratch registers: two single (TMP0/TMP5) and two pairs. */
	51	#define P0L TMP1
	52	#define P0H TMP2
	53	#define P1L TMP3
	54	#define P1H TMP4
	55
	56	//#########################################
	57	//# Constants to be used in the algorithm
	58	//#########################################
	59	ldrd P0L , [ R1 , -3 ]
	60
	61	ldrd P1L , [ R1 , -2 ]
	62
	63
	64
	65	//#############################################################################
66	//# The Algorithm
67	//#
68	//# Operation: C=A/B
69	//# stage 1 - find the reciprocal 1/B according to the following scheme:
70	//# B = (2^E)*m (1<m<2, E=e-127)
71	//# 1/B = 1/((2^E)m) = 1/((2^(E+1))m1) (0.5<m1<1)
72	//# = (2^-(E+1))(1/m1) = (2^E1)(1/m1)
73	//#
74	//# Now we can find the new exponent:
75	//# e1 = E1+127 = -E-1+127 = -e+127-1+127 = 253-e **
76	//# 1/m1 alreadt has the exponent 127, so we have to add 126-e.
77	//# the exponent might underflow, which we can detect as a sign change.
78	//# Since the architeture uses flush-to-zero for subnormals, we can
79	//# give the result 0. then.
80	//#
81	//# The 1/m1 term with 0.5<m1<1 is approximated with the Chebyshev polynomial
82	//# 1/m1 = 2.58586(m1^2) - 5.81818m1 + 4.24242
83	//#
84	//# Next step is to use two iterations of Newton-Raphson algorithm to complete
85	//# the reciprocal calculation.
86	//#
87	//# Final result is achieved by multiplying A with 1/B
88	//#############################################################################
89
90
91
92	// R0 exponent and sign "replacement" into TMP0
93	AND TMP0,R0,P0L ;
94	ORR TMP0,TMP0,P1L
95	SUB TMP5,R0,TMP0 // R0 sign/exponent extraction into TMP5
96	// Calculate new mantissa
97	FMADD P1H,TMP0,P0H ;
98	// Calculate new exponent offset 126 - "old exponent"
99	SUB P1L,P1L,TMP5
100	ldrd P0L , [ R1 , -1 ]
101	FMADD P0L,TMP0,P1H ;
102	eor P1H,r0,P1L // check for overflow (N-BIT).
103	blt .Lret_0
104	// P0L exponent and sign "replacement"
105	sub P0L,P0L,TMP5
106
107	// Newton-Raphson iteration #1
108	MOV TMP0,P0H ;
109	FMSUB P0H,R0,P0L ;
110	FMUL P0L,P0H,P0L ;
111	// Newton-Raphson iteration #2
112	FMSUB TMP0,R0,P0L ;
113	FMUL R0,TMP0,P0L ;
114	jr lr
115	.Lret_0:ldrd P0L , [ R1 , -3 ]
116	lsr TMP0,r0,31 ; extract sign
117	lsl TMP0,TMP0,31
118	add P0L,P0L,r0 ; check for NaN input
119	eor P0L,P0L,r0
120	movgte r0,TMP0
121	jr lr
122	// Quotient calculation is expected by the caller: FMUL quotient,divident,R0
123	;
124	ENDFUNC(__fast_recipsf2)