]>
Commit | Line | Data |
---|---|---|
83ffe9cd | 1 | /* Copyright (C) 2011-2023 Free Software Foundation, Inc. |
feeeff5c JR |
2 | Contributed by Embecosm on behalf of Adapteva, Inc. |
3 | ||
4 | This file is part of GCC. | |
5 | ||
6 | GCC is free software; you can redistribute it and/or modify it under | |
7 | the terms of the GNU General Public License as published by the Free | |
8 | Software Foundation; either version 3, or (at your option) any later | |
9 | version. | |
10 | ||
11 | GCC is distributed in the hope that it will be useful, but WITHOUT ANY | |
12 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
13 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
14 | for more details. | |
15 | ||
16 | Under Section 7 of GPL version 3, you are granted additional | |
17 | permissions described in the GCC Runtime Library Exception, version | |
18 | 3.1, as published by the Free Software Foundation. | |
19 | ||
20 | You should have received a copy of the GNU General Public License and | |
21 | a copy of the GCC Runtime Library Exception along with this program; | |
22 | see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
23 | <http://www.gnu.org/licenses/>. */ | |
24 | ||
25 | #include "../epiphany-asm.h" | |
26 | ||
27 | .section _fast_div_text,"a",@progbits; | |
28 | .balign 8; | |
29 | _fast_div_table: | |
30 | .word 0x007fffff// mantissa mask | |
31 | .word 0x40257ebb// hold constant a = 2.58586 | |
32 | ||
33 | .word 0x3f000000// hold constant 126 shifted to bits [30:23] | |
34 | .word 0xc0ba2e88// hold constant b = -5.81818 | |
35 | ||
36 | .word 0x4087c1e8// hold constant c = 4.24242 | |
37 | .word 0x40000000// to hold constant 2 for Newton-Raphson iterations | |
38 | ||
39 | .global SYM(__fast_recipsf2) | |
40 | FUNC(__fast_recipsf2) | |
41 | SYM(__fast_recipsf2): | |
42 | ||
43 | //################### | |
44 | //# input operands: | |
45 | //################### | |
46 | // Divisor | |
47 | //R0 | |
48 | // Function address (used with negative offsets to read _fast_div_table) | |
49 | //R1 | |
50 | /* Scratch registers: two single (TMP0/TMP5) and two pairs. */ | |
51 | #define P0L TMP1 | |
52 | #define P0H TMP2 | |
53 | #define P1L TMP3 | |
54 | #define P1H TMP4 | |
55 | ||
56 | //######################################### | |
57 | //# Constants to be used in the algorithm | |
58 | //######################################### | |
59 | ldrd P0L , [ R1 , -3 ] | |
60 | ||
61 | ldrd P1L , [ R1 , -2 ] | |
62 | ||
63 | ||
64 | ||
65 | //############################################################################# | |
66 | //# The Algorithm | |
67 | //# | |
68 | //# Operation: C=A/B | |
69 | //# stage 1 - find the reciprocal 1/B according to the following scheme: | |
70 | //# B = (2^E)*m (1<m<2, E=e-127) | |
71 | //# 1/B = 1/((2^E)*m) = 1/((2^(E+1))*m1) (0.5<m1<1) | |
72 | //# = (2^-(E+1))*(1/m1) = (2^E1)*(1/m1) | |
73 | //# | |
74 | //# Now we can find the new exponent: | |
75 | //# e1 = E1+127 = -E-1+127 = -e+127-1+127 = 253-e ** | |
76 | //# 1/m1 alreadt has the exponent 127, so we have to add 126-e. | |
77 | //# the exponent might underflow, which we can detect as a sign change. | |
78 | //# Since the architeture uses flush-to-zero for subnormals, we can | |
79 | //# give the result 0. then. | |
80 | //# | |
81 | //# The 1/m1 term with 0.5<m1<1 is approximated with the Chebyshev polynomial | |
82 | //# 1/m1 = 2.58586*(m1^2) - 5.81818*m1 + 4.24242 | |
83 | //# | |
84 | //# Next step is to use two iterations of Newton-Raphson algorithm to complete | |
85 | //# the reciprocal calculation. | |
86 | //# | |
87 | //# Final result is achieved by multiplying A with 1/B | |
88 | //############################################################################# | |
89 | ||
90 | ||
91 | ||
92 | // R0 exponent and sign "replacement" into TMP0 | |
93 | AND TMP0,R0,P0L ; | |
94 | ORR TMP0,TMP0,P1L | |
95 | SUB TMP5,R0,TMP0 // R0 sign/exponent extraction into TMP5 | |
96 | // Calculate new mantissa | |
97 | FMADD P1H,TMP0,P0H ; | |
98 | // Calculate new exponent offset 126 - "old exponent" | |
99 | SUB P1L,P1L,TMP5 | |
100 | ldrd P0L , [ R1 , -1 ] | |
101 | FMADD P0L,TMP0,P1H ; | |
102 | eor P1H,r0,P1L // check for overflow (N-BIT). | |
103 | blt .Lret_0 | |
104 | // P0L exponent and sign "replacement" | |
105 | sub P0L,P0L,TMP5 | |
106 | ||
107 | // Newton-Raphson iteration #1 | |
108 | MOV TMP0,P0H ; | |
109 | FMSUB P0H,R0,P0L ; | |
110 | FMUL P0L,P0H,P0L ; | |
111 | // Newton-Raphson iteration #2 | |
112 | FMSUB TMP0,R0,P0L ; | |
113 | FMUL R0,TMP0,P0L ; | |
114 | jr lr | |
115 | .Lret_0:ldrd P0L , [ R1 , -3 ] | |
116 | lsr TMP0,r0,31 ; extract sign | |
117 | lsl TMP0,TMP0,31 | |
118 | add P0L,P0L,r0 ; check for NaN input | |
119 | eor P0L,P0L,r0 | |
120 | movgte r0,TMP0 | |
121 | jr lr | |
122 | // Quotient calculation is expected by the caller: FMUL quotient,divident,R0 | |
123 | ; | |
124 | ENDFUNC(__fast_recipsf2) |