/* Copyright (C) 2007-2016 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#ifndef _SQRT_MACROS_H_
#define _SQRT_MACROS_H_

#define FENCE __fence
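/* FENCE is assumed to act as an optimization barrier: each FENCEd
   expression is evaluated and rounded exactly where it appears, so
   the compiler cannot reassociate or contract the compensated sums
   and differences below.  */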

#if DOUBLE_EXTENDED_ON

extern BINARY80 SQRT80 (BINARY80);

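/* Approximate sqrt of a 128-bit integer: assemble
   A10.w[1]*2^64 + A10.w[0] in 80-bit extended precision (64-bit
   significand), take the hardware square root, truncate to 64 bits.  */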
__BID_INLINE__ UINT64
short_sqrt128 (UINT128 A10) {
  BINARY80 lx, ly, l64;
  int_float f64;

  // 2^64
  f64.i = 0x5f800000;
  l64 = (BINARY80) f64.d;
  lx = (BINARY80) A10.w[1] * l64 + (BINARY80) A10.w[0];
  ly = SQRT80 (lx);
  return (UINT64) ly;
}
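
/* Illustrative use (hypothetical values; assumes the libbid UINT128
   layout with w[1] holding the high 64 bits):

     UINT128 a;
     a.w[1] = 1;  a.w[0] = 0;          // a = 2^64
     UINT64 s = short_sqrt128 (a);     // s = 2^32 (sqrt is exact here)

   Note: 0x5f800000 is the IEEE binary32 bit pattern of 2^64; the
   int_float union reinterprets it as a float before widening.  */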

__BID_INLINE__ void
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
  UINT256 C4;
  UINT128 CS;
  UINT64 X;
  SINT64 SE;
  BINARY80 l64, lm64, l128, lxL, lx, ly, lS, lSH, lSL, lE, l3, l2,
    l1, l0, lp, lCl;
  int_float fx, f64, fm64;
  int *ple = (int *) &lx;

  // 2^64
  f64.i = 0x5f800000;
  l64 = (BINARY80) f64.d;

  l128 = l64 * l64;
  lx = l3 = (BINARY80) C256.w[3] * l64 * l128;
  l2 = (BINARY80) C256.w[2] * l128;
  lx = FENCE (lx + l2);
  l1 = (BINARY80) C256.w[1] * l64;
  lx = FENCE (lx + l1);
  l0 = (BINARY80) C256.w[0];
  lx = FENCE (lx + l0);
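  // the FENCEd additions pin the evaluation order so that the
  // rounding error of this sum can be reconstructed below (lCl)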
  // sqrt(C256)
  lS = SQRT80 (lx);

  // get coefficient
  // 2^(-64)
  fm64.i = 0x1f800000;
  lm64 = (BINARY80) fm64.d;
  CS.w[1] = (UINT64) (lS * lm64);
  CS.w[0] = (UINT64) (lS - (BINARY80) CS.w[1] * l64);

  ///////////////////////////////////////
  // CAUTION!
  // little endian code only
  // add solution for big endian
  //////////////////////////////////////
  lSH = lS;
  *((UINT64 *) & lSH) &= 0xffffffff00000000ull;

  // correction for C256 rounding
  lCl = FENCE (l3 - lx);
  lCl = FENCE (lCl + l2);
  lCl = FENCE (lCl + l1);
  lCl = FENCE (lCl + l0);
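  // lCl now approximates C256 - lx, the error committed when the
  // 256-bit integer was rounded to the 80-bit value lx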

  lSL = lS - lSH;

  //////////////////////////////////////////
  // Watch for compiler re-ordering
  //
  /////////////////////////////////////////
  // C256-S^2
  lxL = FENCE (lx - lSH * lSH);
  lp = lSH * lSL;
  lp += lp;
  lxL = FENCE (lxL - lp);
  lSL *= lSL;
  lxL = FENCE (lxL - lSL);
  lCl += lxL;
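  // because lSH/lSL split the significand 32/32, each product above
  // is exact, and lCl accumulates the full residual C256 - lS^2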

  // correction term
  lE = lCl / (lS + lS);
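  // one Heron/Newton step: sqrt(C) ~= S + (C - S^2) / (2*S)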

  // get low part of coefficient
  X = CS.w[0];
  if (lCl >= 0) {
    SE = (SINT64) (lE);
    CS.w[0] += SE;
    if (CS.w[0] < X)
      CS.w[1]++;
  } else {
    SE = (SINT64) (-lE);
    CS.w[0] -= SE;
    if (CS.w[0] > X)
      CS.w[1]--;
  }

  pCS->w[0] = CS.w[0];
  pCS->w[1] = CS.w[1];
}

#else

extern double sqrt (double);

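/* Double-precision path: start from RS = 1/sqrt(A10) computed in
   binary64, then form S = A10*RS*(1 - eps), where
   eps = (A10*RS^2 - 1)/2 is the relative error of RS; this is one
   Newton-Raphson correction carried out in integer arithmetic.  */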
__BID_INLINE__ UINT64
short_sqrt128 (UINT128 A10) {
  UINT256 ARS, ARS0, AE0, AE, S;

  UINT64 MY, ES, CY;
  double lx, l64;
  int_double f64, ly;
  int ey, k;

  // 2^64
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;
  lx = (double) A10.w[1] * l64 + (double) A10.w[0];
  ly.d = 1.0 / sqrt (lx);

  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);
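  // MY = 53-bit significand of 1/sqrt(lx) (hidden bit restored);
  // ey = negated unbiased exponent, so 1/sqrt(lx) ~= MY * 2^(-ey-52)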

  // A10*RS^2
  __mul_64x128_to_192 (ARS0, MY, A10);
  __mul_64x192_to_256 (ARS, MY, ARS0);

  // shr by 2*ey+40, to get a 64-bit value
  k = (ey << 1) + 104 - 64;
  if (k >= 128) {
    if (k > 128)
      ES = (ARS.w[2] >> (k - 128)) | (ARS.w[3] << (192 - k));
    else
      ES = ARS.w[2];
  } else {
    if (k >= 64) {
      ARS.w[0] = ARS.w[1];
      ARS.w[1] = ARS.w[2];
      k -= 64;
    }
    if (k) {
      __shr_128 (ARS, ARS, k);
    }
    ES = ARS.w[0];
  }

  ES = ((SINT64) ES) >> 1;
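  // ES ~= eps = (A10*RS^2 - 1)/2, the signed relative error of RS,
  // scaled by 2^64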

  if (((SINT64) ES) < 0) {
    ES = -ES;

    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);

    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];

    __add_carry_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] + AE.w[2] + CY;
  } else {
    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);

    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];

    __sub_borrow_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] - AE.w[2] - CY;
  }

  k = ey + 51;

  if (k >= 64) {
    if (k >= 128) {
      S.w[0] = S.w[2];
      S.w[1] = 0;
      k -= 128;
    } else {
      S.w[0] = S.w[1];
      S.w[1] = S.w[2];
    }
    k -= 64;
  }
  if (k) {
    __shr_128 (S, S, k);
  }

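  // S.w[0] now holds ~2*sqrt(A10); adding 1 before the final right
  // shift rounds the result to nearest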
  return (UINT64) ((S.w[0] + 1) >> 1);
}


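/* Double-precision path for the 256-bit operand: same
   reciprocal-sqrt scheme as above, but with the correction carried to
   second order, S ~= A*RS*(1 - eps + (3/2)*eps^2), to reach the
   accuracy needed for a 128-bit coefficient.  */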
__BID_INLINE__ void
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
  UINT512 ARS0, ARS;
  UINT256 ARS00, AE, AE2, S;
  UINT128 ES, ES2, ARS1;
  UINT64 ES32, CY, MY;
  double l64, l128, lx, l2, l1, l0;
  int_double f64, ly;
  int ey, k, k2;

  // 2^64
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;

  l128 = l64 * l64;
  lx = (double) C256.w[3] * l64 * l128;
  l2 = (double) C256.w[2] * l128;
  lx = FENCE (lx + l2);
  l1 = (double) C256.w[1] * l64;
  lx = FENCE (lx + l1);
  l0 = (double) C256.w[0];
  lx = FENCE (lx + l0);
  // sqrt(C256)
  ly.d = 1.0 / sqrt (lx);

  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);

  // A10*RS^2, scaled by 2^(2*ey+104)
  __mul_64x256_to_320 (ARS0, MY, C256);
  __mul_64x320_to_384 (ARS, MY, ARS0);

  // shr by k=(2*ey+104)-128
  // expect k to be in the range (192, 256) if the result is in [10^33, 10^34)
  // apply an additional signed shift by 1 at the same time (to get eps=eps0/2)
  k = (ey << 1) + 104 - 128 - 192;
  k2 = 64 - k;
  ES.w[0] = (ARS.w[3] >> (k + 1)) | (ARS.w[4] << (k2 - 1));
  ES.w[1] = (ARS.w[4] >> k) | (ARS.w[5] << k2);
  ES.w[1] = ((SINT64) ES.w[1]) >> 1;

  // A*RS >> 192 (for error term computation)
  ARS1.w[0] = ARS0.w[3];
  ARS1.w[1] = ARS0.w[4];

  // A*RS>>64
  ARS00.w[0] = ARS0.w[1];
  ARS00.w[1] = ARS0.w[2];
  ARS00.w[2] = ARS0.w[3];
  ARS00.w[3] = ARS0.w[4];

  if (((SINT64) ES.w[1]) < 0) {
    ES.w[0] = -ES.w[0];
    ES.w[1] = -ES.w[1];
    if (ES.w[0])
      ES.w[1]--;

    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);

    __add_carry_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __add_carry_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] + AE.w[3] + CY;
  } else {
    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);

    __sub_borrow_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __sub_borrow_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] - AE.w[3] - CY;
  }

  // 3/2*eps^2, scaled by 2^128
  ES32 = ES.w[1] + (ES.w[1] >> 1);
  __mul_64x64_to_128 (ES2, ES32, ES.w[1]);
  // A*RS*3/2*eps^2
  __mul_128x128_to_256 (AE2, ES2, ARS1);
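  // overall S ~= A*RS * (1 - eps + (3/2)*eps^2), the degree-2
  // expansion of A*RS / sqrt(1 + 2*eps)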

  // result, scaled by 2^(ey+52-64)
  __add_carry_out (S.w[0], CY, S.w[0], AE2.w[0]);
  __add_carry_in_out (S.w[1], CY, S.w[1], AE2.w[1], CY);
  __add_carry_in_out (S.w[2], CY, S.w[2], AE2.w[2], CY);
  S.w[3] = S.w[3] + AE2.w[3] + CY;

  // k in (0, 64)
  k = ey + 51 - 128;
  k2 = 64 - k;
  S.w[0] = (S.w[1] >> k) | (S.w[2] << k2);
  S.w[1] = (S.w[2] >> k) | (S.w[3] << k2);

  // round to nearest
  S.w[0]++;
  if (!S.w[0])
    S.w[1]++;

  pCS->w[0] = (S.w[1] << 63) | (S.w[0] >> 1);
  pCS->w[1] = S.w[1] >> 1;
}

#endif
#endif