]>
Commit | Line | Data |
---|---|---|
e4d82761 UD |
1 | /* |
2 | * IBM Accurate Mathematical Library | |
aeb25823 | 3 | * written by International Business Machines Corp. |
b168057a | 4 | * Copyright (C) 2001-2015 Free Software Foundation, Inc. |
e4d82761 UD |
5 | * |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU Lesser General Public License as published by | |
cc7375ce | 8 | * the Free Software Foundation; either version 2.1 of the License, or |
e4d82761 | 9 | * (at your option) any later version. |
50944bca | 10 | * |
e4d82761 UD |
11 | * This program is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
c6c6dd48 | 14 | * GNU Lesser General Public License for more details. |
e4d82761 UD |
15 | * |
16 | * You should have received a copy of the GNU Lesser General Public License | |
59ba27a6 | 17 | * along with this program; if not, see <http://www.gnu.org/licenses/>. |
e4d82761 UD |
18 | */ |
19 | /************************************************************************/ | |
20 | /* MODULE_NAME: mpa.c */ | |
21 | /* */ | |
22 | /* FUNCTIONS: */ | |
23 | /* mcr */ | |
24 | /* acr */ | |
e4d82761 | 25 | /* cpy */ |
e4d82761 UD |
26 | /* norm */ |
27 | /* denorm */ | |
28 | /* mp_dbl */ | |
29 | /* dbl_mp */ | |
30 | /* add_magnitudes */ | |
31 | /* sub_magnitudes */ | |
32 | /* add */ | |
33 | /* sub */ | |
34 | /* mul */ | |
35 | /* inv */ | |
36 | /* dvd */ | |
37 | /* */ | |
38 | /* Arithmetic functions for multiple precision numbers. */ | |
39 | /* Relative errors are bounded */ | |
40 | /************************************************************************/ | |
41 | ||
42 | ||
43 | #include "endian.h" | |
44 | #include "mpa.h" | |
950c99ca | 45 | #include <sys/param.h> |
45f05884 | 46 | #include <alloca.h> |
31d3cc00 UD |
47 | |
48 | #ifndef SECTION | |
49 | # define SECTION | |
50 | #endif | |
51 | ||
b76eb5f0 | 52 | #ifndef NO__CONST |
107a5bf0 JM |
53 | const mp_no __mpone = { 1, { 1.0, 1.0 } }; |
54 | const mp_no __mptwo = { 1, { 1.0, 2.0 } }; | |
b76eb5f0 SP |
55 | #endif |
56 | ||
31d3cc00 | 57 | #ifndef NO___ACR |
950c99ca SP |
58 | /* Compare mantissa of two multiple precision numbers regardless of the sign |
59 | and exponent of the numbers. */ | |
31d3cc00 | 60 | static int |
1066a534 SP |
61 | mcr (const mp_no *x, const mp_no *y, int p) |
62 | { | |
e69804d1 SP |
63 | long i; |
64 | long p2 = p; | |
65 | for (i = 1; i <= p2; i++) | |
1066a534 SP |
66 | { |
67 | if (X[i] == Y[i]) | |
68 | continue; | |
69 | else if (X[i] > Y[i]) | |
70 | return 1; | |
71 | else | |
72 | return -1; | |
73 | } | |
e4d82761 UD |
74 | return 0; |
75 | } | |
76 | ||
950c99ca | 77 | /* Compare the absolute values of two multiple precision numbers. */ |
31d3cc00 | 78 | int |
1066a534 SP |
79 | __acr (const mp_no *x, const mp_no *y, int p) |
80 | { | |
e69804d1 | 81 | long i; |
e4d82761 | 82 | |
5739f705 | 83 | if (X[0] == 0) |
1066a534 | 84 | { |
5739f705 | 85 | if (Y[0] == 0) |
1066a534 SP |
86 | i = 0; |
87 | else | |
88 | i = -1; | |
89 | } | |
5739f705 | 90 | else if (Y[0] == 0) |
1066a534 SP |
91 | i = 1; |
92 | else | |
93 | { | |
94 | if (EX > EY) | |
95 | i = 1; | |
96 | else if (EX < EY) | |
97 | i = -1; | |
98 | else | |
99 | i = mcr (x, y, p); | |
100 | } | |
e4d82761 UD |
101 | |
102 | return i; | |
103 | } | |
31d3cc00 | 104 | #endif |
e4d82761 | 105 | |
af968f62 | 106 | #ifndef NO___CPY |
950c99ca SP |
107 | /* Copy multiple precision number X into Y. They could be the same |
108 | number. */ | |
1066a534 SP |
109 | void |
110 | __cpy (const mp_no *x, mp_no *y, int p) | |
111 | { | |
e69804d1 SP |
112 | long i; |
113 | ||
e4d82761 | 114 | EY = EX; |
e69804d1 | 115 | for (i = 0; i <= p; i++) |
1066a534 | 116 | Y[i] = X[i]; |
e4d82761 | 117 | } |
af968f62 | 118 | #endif |
e4d82761 | 119 | |
af968f62 | 120 | #ifndef NO___MP_DBL |
950c99ca SP |
121 | /* Convert a multiple precision number *X into a double precision |
122 | number *Y, normalized case (|x| >= 2**(-1022))). */ | |
1066a534 SP |
123 | static void |
124 | norm (const mp_no *x, double *y, int p) | |
50944bca | 125 | { |
c5d5d574 | 126 | # define R RADIXI |
e69804d1 | 127 | long i; |
6f2e90e7 SP |
128 | double c; |
129 | mantissa_t a, u, v, z[5]; | |
1066a534 SP |
130 | if (p < 5) |
131 | { | |
132 | if (p == 1) | |
133 | c = X[1]; | |
134 | else if (p == 2) | |
135 | c = X[1] + R * X[2]; | |
136 | else if (p == 3) | |
137 | c = X[1] + R * (X[2] + R * X[3]); | |
138 | else if (p == 4) | |
139 | c = (X[1] + R * X[2]) + R * R * (X[3] + R * X[4]); | |
e4d82761 | 140 | } |
1066a534 SP |
141 | else |
142 | { | |
c5d5d574 | 143 | for (a = 1, z[1] = X[1]; z[1] < TWO23; ) |
1066a534 | 144 | { |
5739f705 SP |
145 | a *= 2; |
146 | z[1] *= 2; | |
1066a534 | 147 | } |
e4d82761 | 148 | |
1066a534 SP |
149 | for (i = 2; i < 5; i++) |
150 | { | |
6f2e90e7 SP |
151 | mantissa_store_t d, r; |
152 | d = X[i] * (mantissa_store_t) a; | |
153 | DIV_RADIX (d, r); | |
154 | z[i] = r; | |
155 | z[i - 1] += d; | |
1066a534 SP |
156 | } |
157 | ||
6f2e90e7 | 158 | u = ALIGN_DOWN_TO (z[3], TWO19); |
1066a534 SP |
159 | v = z[3] - u; |
160 | ||
161 | if (v == TWO18) | |
162 | { | |
5739f705 | 163 | if (z[4] == 0) |
1066a534 SP |
164 | { |
165 | for (i = 5; i <= p; i++) | |
166 | { | |
5739f705 | 167 | if (X[i] == 0) |
1066a534 SP |
168 | continue; |
169 | else | |
170 | { | |
5739f705 | 171 | z[3] += 1; |
1066a534 SP |
172 | break; |
173 | } | |
174 | } | |
175 | } | |
176 | else | |
5739f705 | 177 | z[3] += 1; |
1066a534 SP |
178 | } |
179 | ||
180 | c = (z[1] + R * (z[2] + R * z[3])) / a; | |
181 | } | |
e4d82761 UD |
182 | |
183 | c *= X[0]; | |
184 | ||
1066a534 SP |
185 | for (i = 1; i < EX; i++) |
186 | c *= RADIX; | |
187 | for (i = 1; i > EX; i--) | |
188 | c *= RADIXI; | |
e4d82761 UD |
189 | |
190 | *y = c; | |
c5d5d574 | 191 | # undef R |
e4d82761 UD |
192 | } |
193 | ||
950c99ca SP |
194 | /* Convert a multiple precision number *X into a double precision |
195 | number *Y, Denormal case (|x| < 2**(-1022))). */ | |
1066a534 SP |
196 | static void |
197 | denorm (const mp_no *x, double *y, int p) | |
50944bca | 198 | { |
e69804d1 SP |
199 | long i, k; |
200 | long p2 = p; | |
6f2e90e7 SP |
201 | double c; |
202 | mantissa_t u, z[5]; | |
e4d82761 | 203 | |
c5d5d574 | 204 | # define R RADIXI |
1066a534 SP |
205 | if (EX < -44 || (EX == -44 && X[1] < TWO5)) |
206 | { | |
5739f705 | 207 | *y = 0; |
1066a534 SP |
208 | return; |
209 | } | |
210 | ||
e69804d1 | 211 | if (p2 == 1) |
1066a534 SP |
212 | { |
213 | if (EX == -42) | |
214 | { | |
215 | z[1] = X[1] + TWO10; | |
5739f705 SP |
216 | z[2] = 0; |
217 | z[3] = 0; | |
1066a534 SP |
218 | k = 3; |
219 | } | |
220 | else if (EX == -43) | |
221 | { | |
222 | z[1] = TWO10; | |
223 | z[2] = X[1]; | |
5739f705 | 224 | z[3] = 0; |
1066a534 SP |
225 | k = 2; |
226 | } | |
227 | else | |
228 | { | |
229 | z[1] = TWO10; | |
5739f705 | 230 | z[2] = 0; |
1066a534 SP |
231 | z[3] = X[1]; |
232 | k = 1; | |
233 | } | |
234 | } | |
e69804d1 | 235 | else if (p2 == 2) |
1066a534 SP |
236 | { |
237 | if (EX == -42) | |
238 | { | |
239 | z[1] = X[1] + TWO10; | |
240 | z[2] = X[2]; | |
5739f705 | 241 | z[3] = 0; |
1066a534 SP |
242 | k = 3; |
243 | } | |
244 | else if (EX == -43) | |
245 | { | |
246 | z[1] = TWO10; | |
247 | z[2] = X[1]; | |
248 | z[3] = X[2]; | |
249 | k = 2; | |
250 | } | |
251 | else | |
252 | { | |
253 | z[1] = TWO10; | |
5739f705 | 254 | z[2] = 0; |
1066a534 SP |
255 | z[3] = X[1]; |
256 | k = 1; | |
257 | } | |
258 | } | |
259 | else | |
260 | { | |
261 | if (EX == -42) | |
262 | { | |
263 | z[1] = X[1] + TWO10; | |
264 | z[2] = X[2]; | |
265 | k = 3; | |
266 | } | |
267 | else if (EX == -43) | |
268 | { | |
269 | z[1] = TWO10; | |
270 | z[2] = X[1]; | |
271 | k = 2; | |
272 | } | |
273 | else | |
274 | { | |
275 | z[1] = TWO10; | |
5739f705 | 276 | z[2] = 0; |
1066a534 SP |
277 | k = 1; |
278 | } | |
279 | z[3] = X[k]; | |
280 | } | |
e4d82761 | 281 | |
6f2e90e7 | 282 | u = ALIGN_DOWN_TO (z[3], TWO5); |
e4d82761 | 283 | |
1066a534 SP |
284 | if (u == z[3]) |
285 | { | |
e69804d1 | 286 | for (i = k + 1; i <= p2; i++) |
1066a534 | 287 | { |
5739f705 | 288 | if (X[i] == 0) |
1066a534 SP |
289 | continue; |
290 | else | |
291 | { | |
5739f705 | 292 | z[3] += 1; |
1066a534 SP |
293 | break; |
294 | } | |
295 | } | |
e4d82761 | 296 | } |
e4d82761 | 297 | |
1066a534 | 298 | c = X[0] * ((z[1] + R * (z[2] + R * z[3])) - TWO10); |
e4d82761 | 299 | |
1066a534 | 300 | *y = c * TWOM1032; |
c5d5d574 | 301 | # undef R |
e4d82761 UD |
302 | } |
303 | ||
950c99ca SP |
304 | /* Convert multiple precision number *X into double precision number *Y. The |
305 | result is correctly rounded to the nearest/even. */ | |
1066a534 SP |
306 | void |
307 | __mp_dbl (const mp_no *x, double *y, int p) | |
308 | { | |
5739f705 | 309 | if (X[0] == 0) |
1066a534 | 310 | { |
5739f705 | 311 | *y = 0; |
1066a534 SP |
312 | return; |
313 | } | |
e4d82761 | 314 | |
18ea052c | 315 | if (__glibc_likely (EX > -42 || (EX == -42 && X[1] >= TWO10))) |
1066a534 | 316 | norm (x, y, p); |
18ea052c | 317 | else |
1066a534 | 318 | denorm (x, y, p); |
e4d82761 | 319 | } |
af968f62 | 320 | #endif |
e4d82761 | 321 | |
950c99ca SP |
322 | /* Get the multiple precision equivalent of X into *Y. If the precision is too |
323 | small, the result is truncated. */ | |
31d3cc00 UD |
324 | void |
325 | SECTION | |
1066a534 SP |
326 | __dbl_mp (double x, mp_no *y, int p) |
327 | { | |
e69804d1 SP |
328 | long i, n; |
329 | long p2 = p; | |
e4d82761 | 330 | |
950c99ca | 331 | /* Sign. */ |
5739f705 | 332 | if (x == 0) |
1066a534 | 333 | { |
5739f705 | 334 | Y[0] = 0; |
1066a534 SP |
335 | return; |
336 | } | |
5739f705 SP |
337 | else if (x > 0) |
338 | Y[0] = 1; | |
1066a534 SP |
339 | else |
340 | { | |
5739f705 | 341 | Y[0] = -1; |
1066a534 SP |
342 | x = -x; |
343 | } | |
e4d82761 | 344 | |
950c99ca | 345 | /* Exponent. */ |
5739f705 | 346 | for (EY = 1; x >= RADIX; EY += 1) |
1066a534 | 347 | x *= RADIXI; |
5739f705 | 348 | for (; x < 1; EY -= 1) |
1066a534 | 349 | x *= RADIX; |
e4d82761 | 350 | |
950c99ca | 351 | /* Digits. */ |
e69804d1 | 352 | n = MIN (p2, 4); |
1066a534 SP |
353 | for (i = 1; i <= n; i++) |
354 | { | |
6f2e90e7 | 355 | INTEGER_OF (x, Y[i]); |
1066a534 SP |
356 | x *= RADIX; |
357 | } | |
e69804d1 | 358 | for (; i <= p2; i++) |
5739f705 | 359 | Y[i] = 0; |
e4d82761 UD |
360 | } |
361 | ||
950c99ca SP |
362 | /* Add magnitudes of *X and *Y assuming that abs (*X) >= abs (*Y) > 0. The |
363 | sign of the sum *Z is not changed. X and Y may overlap but not X and Z or | |
364 | Y and Z. No guard digit is used. The result equals the exact sum, | |
365 | truncated. */ | |
31d3cc00 UD |
366 | static void |
367 | SECTION | |
1066a534 SP |
368 | add_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p) |
369 | { | |
e69804d1 SP |
370 | long i, j, k; |
371 | long p2 = p; | |
6f2e90e7 | 372 | mantissa_t zk; |
e4d82761 UD |
373 | |
374 | EZ = EX; | |
375 | ||
e69804d1 SP |
376 | i = p2; |
377 | j = p2 + EY - EX; | |
378 | k = p2 + 1; | |
1066a534 | 379 | |
bdf02814 | 380 | if (__glibc_unlikely (j < 1)) |
1066a534 SP |
381 | { |
382 | __cpy (x, z, p); | |
383 | return; | |
384 | } | |
bdf02814 | 385 | |
5739f705 | 386 | zk = 0; |
1066a534 SP |
387 | |
388 | for (; j > 0; i--, j--) | |
389 | { | |
bdf02814 SP |
390 | zk += X[i] + Y[j]; |
391 | if (zk >= RADIX) | |
1066a534 | 392 | { |
bdf02814 | 393 | Z[k--] = zk - RADIX; |
5739f705 | 394 | zk = 1; |
1066a534 SP |
395 | } |
396 | else | |
c5d5d574 | 397 | { |
bdf02814 | 398 | Z[k--] = zk; |
5739f705 | 399 | zk = 0; |
bdf02814 | 400 | } |
1066a534 SP |
401 | } |
402 | ||
403 | for (; i > 0; i--) | |
404 | { | |
bdf02814 SP |
405 | zk += X[i]; |
406 | if (zk >= RADIX) | |
1066a534 | 407 | { |
bdf02814 | 408 | Z[k--] = zk - RADIX; |
5739f705 | 409 | zk = 1; |
1066a534 SP |
410 | } |
411 | else | |
c5d5d574 | 412 | { |
bdf02814 | 413 | Z[k--] = zk; |
5739f705 | 414 | zk = 0; |
bdf02814 | 415 | } |
1066a534 SP |
416 | } |
417 | ||
5739f705 | 418 | if (zk == 0) |
1066a534 | 419 | { |
e69804d1 | 420 | for (i = 1; i <= p2; i++) |
1066a534 SP |
421 | Z[i] = Z[i + 1]; |
422 | } | |
423 | else | |
bdf02814 SP |
424 | { |
425 | Z[1] = zk; | |
5739f705 | 426 | EZ += 1; |
bdf02814 | 427 | } |
e4d82761 UD |
428 | } |
429 | ||
950c99ca SP |
430 | /* Subtract the magnitudes of *X and *Y assuming that abs (*x) > abs (*y) > 0. |
431 | The sign of the difference *Z is not changed. X and Y may overlap but not X | |
432 | and Z or Y and Z. One guard digit is used. The error is less than one | |
433 | ULP. */ | |
31d3cc00 UD |
434 | static void |
435 | SECTION | |
1066a534 SP |
436 | sub_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p) |
437 | { | |
e69804d1 SP |
438 | long i, j, k; |
439 | long p2 = p; | |
6f2e90e7 | 440 | mantissa_t zk; |
e4d82761 UD |
441 | |
442 | EZ = EX; | |
e69804d1 SP |
443 | i = p2; |
444 | j = p2 + EY - EX; | |
445 | k = p2; | |
e4d82761 | 446 | |
bdf02814 SP |
447 | /* Y is too small compared to X, copy X over to the result. */ |
448 | if (__glibc_unlikely (j < 1)) | |
1066a534 | 449 | { |
bdf02814 SP |
450 | __cpy (x, z, p); |
451 | return; | |
1066a534 | 452 | } |
bdf02814 SP |
453 | |
454 | /* The relevant least significant digit in Y is non-zero, so we factor it in | |
455 | to enhance accuracy. */ | |
5739f705 | 456 | if (j < p2 && Y[j + 1] > 0) |
1066a534 | 457 | { |
bdf02814 | 458 | Z[k + 1] = RADIX - Y[j + 1]; |
5739f705 | 459 | zk = -1; |
1066a534 | 460 | } |
bdf02814 | 461 | else |
5739f705 | 462 | zk = Z[k + 1] = 0; |
1066a534 | 463 | |
bdf02814 | 464 | /* Subtract and borrow. */ |
1066a534 SP |
465 | for (; j > 0; i--, j--) |
466 | { | |
bdf02814 | 467 | zk += (X[i] - Y[j]); |
5739f705 | 468 | if (zk < 0) |
1066a534 | 469 | { |
bdf02814 | 470 | Z[k--] = zk + RADIX; |
5739f705 | 471 | zk = -1; |
1066a534 SP |
472 | } |
473 | else | |
c5d5d574 | 474 | { |
bdf02814 | 475 | Z[k--] = zk; |
5739f705 | 476 | zk = 0; |
bdf02814 | 477 | } |
1066a534 SP |
478 | } |
479 | ||
bdf02814 | 480 | /* We're done with digits from Y, so it's just digits in X. */ |
1066a534 SP |
481 | for (; i > 0; i--) |
482 | { | |
bdf02814 | 483 | zk += X[i]; |
5739f705 | 484 | if (zk < 0) |
1066a534 | 485 | { |
bdf02814 | 486 | Z[k--] = zk + RADIX; |
5739f705 | 487 | zk = -1; |
1066a534 SP |
488 | } |
489 | else | |
c5d5d574 | 490 | { |
bdf02814 | 491 | Z[k--] = zk; |
5739f705 | 492 | zk = 0; |
bdf02814 | 493 | } |
1066a534 SP |
494 | } |
495 | ||
bdf02814 | 496 | /* Normalize. */ |
c5d5d574 OB |
497 | for (i = 1; Z[i] == 0; i++) |
498 | ; | |
e4d82761 | 499 | EZ = EZ - i + 1; |
c5d5d574 | 500 | for (k = 1; i <= p2 + 1; ) |
e4d82761 | 501 | Z[k++] = Z[i++]; |
c5d5d574 | 502 | for (; k <= p2; ) |
5739f705 | 503 | Z[k++] = 0; |
e4d82761 UD |
504 | } |
505 | ||
950c99ca SP |
506 | /* Add *X and *Y and store the result in *Z. X and Y may overlap, but not X |
507 | and Z or Y and Z. One guard digit is used. The error is less than one | |
508 | ULP. */ | |
31d3cc00 UD |
509 | void |
510 | SECTION | |
1066a534 SP |
511 | __add (const mp_no *x, const mp_no *y, mp_no *z, int p) |
512 | { | |
e4d82761 UD |
513 | int n; |
514 | ||
5739f705 | 515 | if (X[0] == 0) |
1066a534 SP |
516 | { |
517 | __cpy (y, z, p); | |
518 | return; | |
519 | } | |
5739f705 | 520 | else if (Y[0] == 0) |
1066a534 SP |
521 | { |
522 | __cpy (x, z, p); | |
523 | return; | |
524 | } | |
525 | ||
526 | if (X[0] == Y[0]) | |
527 | { | |
528 | if (__acr (x, y, p) > 0) | |
529 | { | |
530 | add_magnitudes (x, y, z, p); | |
531 | Z[0] = X[0]; | |
532 | } | |
533 | else | |
534 | { | |
535 | add_magnitudes (y, x, z, p); | |
536 | Z[0] = Y[0]; | |
537 | } | |
538 | } | |
539 | else | |
540 | { | |
541 | if ((n = __acr (x, y, p)) == 1) | |
542 | { | |
543 | sub_magnitudes (x, y, z, p); | |
544 | Z[0] = X[0]; | |
545 | } | |
546 | else if (n == -1) | |
547 | { | |
548 | sub_magnitudes (y, x, z, p); | |
549 | Z[0] = Y[0]; | |
550 | } | |
551 | else | |
5739f705 | 552 | Z[0] = 0; |
1066a534 | 553 | } |
e4d82761 UD |
554 | } |
555 | ||
950c99ca SP |
556 | /* Subtract *Y from *X and return the result in *Z. X and Y may overlap but |
557 | not X and Z or Y and Z. One guard digit is used. The error is less than | |
558 | one ULP. */ | |
31d3cc00 UD |
559 | void |
560 | SECTION | |
1066a534 SP |
561 | __sub (const mp_no *x, const mp_no *y, mp_no *z, int p) |
562 | { | |
e4d82761 UD |
563 | int n; |
564 | ||
5739f705 | 565 | if (X[0] == 0) |
1066a534 SP |
566 | { |
567 | __cpy (y, z, p); | |
568 | Z[0] = -Z[0]; | |
569 | return; | |
570 | } | |
5739f705 | 571 | else if (Y[0] == 0) |
1066a534 SP |
572 | { |
573 | __cpy (x, z, p); | |
574 | return; | |
575 | } | |
576 | ||
577 | if (X[0] != Y[0]) | |
578 | { | |
579 | if (__acr (x, y, p) > 0) | |
580 | { | |
581 | add_magnitudes (x, y, z, p); | |
582 | Z[0] = X[0]; | |
583 | } | |
584 | else | |
585 | { | |
586 | add_magnitudes (y, x, z, p); | |
587 | Z[0] = -Y[0]; | |
588 | } | |
589 | } | |
590 | else | |
591 | { | |
592 | if ((n = __acr (x, y, p)) == 1) | |
593 | { | |
594 | sub_magnitudes (x, y, z, p); | |
595 | Z[0] = X[0]; | |
596 | } | |
597 | else if (n == -1) | |
598 | { | |
599 | sub_magnitudes (y, x, z, p); | |
600 | Z[0] = -Y[0]; | |
601 | } | |
602 | else | |
5739f705 | 603 | Z[0] = 0; |
1066a534 | 604 | } |
e4d82761 UD |
605 | } |
606 | ||
82a9811d | 607 | #ifndef NO__MUL |
950c99ca SP |
608 | /* Multiply *X and *Y and store result in *Z. X and Y may overlap but not X |
609 | and Z or Y and Z. For P in [1, 2, 3], the exact result is truncated to P | |
610 | digits. In case P > 3 the error is bounded by 1.001 ULP. */ | |
31d3cc00 UD |
611 | void |
612 | SECTION | |
1066a534 SP |
613 | __mul (const mp_no *x, const mp_no *y, mp_no *z, int p) |
614 | { | |
e69804d1 SP |
615 | long i, j, k, ip, ip2; |
616 | long p2 = p; | |
6f2e90e7 | 617 | mantissa_store_t zk; |
2d0e0f29 | 618 | const mp_no *a; |
6f2e90e7 | 619 | mantissa_store_t *diag; |
e4d82761 | 620 | |
44e0d4c2 | 621 | /* Is z=0? */ |
5739f705 | 622 | if (__glibc_unlikely (X[0] * Y[0] == 0)) |
44e0d4c2 | 623 | { |
5739f705 | 624 | Z[0] = 0; |
44e0d4c2 SP |
625 | return; |
626 | } | |
e4d82761 | 627 | |
909279a5 SP |
628 | /* We need not iterate through all X's and Y's since it's pointless to |
629 | multiply zeroes. Here, both are zero... */ | |
e69804d1 | 630 | for (ip2 = p2; ip2 > 0; ip2--) |
5739f705 | 631 | if (X[ip2] != 0 || Y[ip2] != 0) |
909279a5 | 632 | break; |
44e0d4c2 | 633 | |
5739f705 | 634 | a = X[ip2] != 0 ? y : x; |
2d0e0f29 | 635 | |
909279a5 SP |
636 | /* ... and here, at least one of them is still zero. */ |
637 | for (ip = ip2; ip > 0; ip--) | |
5739f705 | 638 | if (a->d[ip] != 0) |
909279a5 SP |
639 | break; |
640 | ||
641 | /* The product looks like this for p = 3 (as an example): | |
642 | ||
643 | ||
644 | a1 a2 a3 | |
645 | x b1 b2 b3 | |
646 | ----------------------------- | |
647 | a1*b3 a2*b3 a3*b3 | |
648 | a1*b2 a2*b2 a3*b2 | |
649 | a1*b1 a2*b1 a3*b1 | |
650 | ||
651 | So our K needs to ideally be P*2, but we're limiting ourselves to P + 3 | |
652 | for P >= 3. We compute the above digits in two parts; the last P-1 | |
653 | digits and then the first P digits. The last P-1 digits are a sum of | |
654 | products of the input digits from P to P-k where K is 0 for the least | |
655 | significant digit and increases as we go towards the left. The product | |
656 | term is of the form X[k]*X[P-k] as can be seen in the above example. | |
657 | ||
658 | The first P digits are also a sum of products with the same product term, | |
659 | except that the sum is from 1 to k. This is also evident from the above | |
660 | example. | |
661 | ||
662 | Another thing that becomes evident is that only the most significant | |
663 | ip+ip2 digits of the result are non-zero, where ip and ip2 are the | |
664 | 'internal precision' of the input numbers, i.e. digits after ip and ip2 | |
5739f705 | 665 | are all 0. */ |
909279a5 | 666 | |
e69804d1 | 667 | k = (__glibc_unlikely (p2 < 3)) ? p2 + p2 : p2 + 3; |
909279a5 SP |
668 | |
669 | while (k > ip + ip2 + 1) | |
5739f705 | 670 | Z[k--] = 0; |
909279a5 | 671 | |
5739f705 | 672 | zk = 0; |
45f05884 SP |
673 | |
674 | /* Precompute sums of diagonal elements so that we can directly use them | |
675 | later. See the next comment to know we why need them. */ | |
6f2e90e7 | 676 | diag = alloca (k * sizeof (mantissa_store_t)); |
5739f705 | 677 | mantissa_store_t d = 0; |
45f05884 SP |
678 | for (i = 1; i <= ip; i++) |
679 | { | |
6f2e90e7 | 680 | d += X[i] * (mantissa_store_t) Y[i]; |
45f05884 SP |
681 | diag[i] = d; |
682 | } | |
683 | while (i < k) | |
684 | diag[i++] = d; | |
909279a5 | 685 | |
e69804d1 | 686 | while (k > p2) |
44e0d4c2 | 687 | { |
45f05884 SP |
688 | long lim = k / 2; |
689 | ||
690 | if (k % 2 == 0) | |
691 | /* We want to add this only once, but since we subtract it in the sum | |
692 | of products above, we add twice. */ | |
6f2e90e7 | 693 | zk += 2 * X[lim] * (mantissa_store_t) Y[lim]; |
45f05884 SP |
694 | |
695 | for (i = k - p2, j = p2; i < j; i++, j--) | |
6f2e90e7 | 696 | zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]); |
45f05884 SP |
697 | |
698 | zk -= diag[k - 1]; | |
44e0d4c2 | 699 | |
6f2e90e7 SP |
700 | DIV_RADIX (zk, Z[k]); |
701 | k--; | |
44e0d4c2 SP |
702 | } |
703 | ||
45f05884 SP |
704 | /* The real deal. Mantissa digit Z[k] is the sum of all X[i] * Y[j] where i |
705 | goes from 1 -> k - 1 and j goes the same range in reverse. To reduce the | |
706 | number of multiplications, we halve the range and if k is an even number, | |
707 | add the diagonal element X[k/2]Y[k/2]. Through the half range, we compute | |
708 | X[i] * Y[j] as (X[i] + X[j]) * (Y[i] + Y[j]) - X[i] * Y[i] - X[j] * Y[j]. | |
709 | ||
710 | This reduction tells us that we're summing two things, the first term | |
711 | through the half range and the negative of the sum of the product of all | |
712 | terms of X and Y in the full range. i.e. | |
713 | ||
714 | SUM(X[i] * Y[i]) for k terms. This is precalculated above for each k in | |
715 | a single loop so that it completes in O(n) time and can hence be directly | |
716 | used in the loop below. */ | |
44e0d4c2 SP |
717 | while (k > 1) |
718 | { | |
45f05884 SP |
719 | long lim = k / 2; |
720 | ||
721 | if (k % 2 == 0) | |
722 | /* We want to add this only once, but since we subtract it in the sum | |
723 | of products above, we add twice. */ | |
6f2e90e7 | 724 | zk += 2 * X[lim] * (mantissa_store_t) Y[lim]; |
45f05884 SP |
725 | |
726 | for (i = 1, j = k - 1; i < j; i++, j--) | |
6f2e90e7 | 727 | zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]); |
45f05884 SP |
728 | |
729 | zk -= diag[k - 1]; | |
44e0d4c2 | 730 | |
6f2e90e7 SP |
731 | DIV_RADIX (zk, Z[k]); |
732 | k--; | |
44e0d4c2 | 733 | } |
2a91b573 | 734 | Z[k] = zk; |
44e0d4c2 | 735 | |
4709fe76 SP |
736 | /* Get the exponent sum into an intermediate variable. This is a subtle |
737 | optimization, where given enough registers, all operations on the exponent | |
738 | happen in registers and the result is written out only once into EZ. */ | |
739 | int e = EX + EY; | |
740 | ||
950c99ca | 741 | /* Is there a carry beyond the most significant digit? */ |
5739f705 | 742 | if (__glibc_unlikely (Z[1] == 0)) |
44e0d4c2 | 743 | { |
e69804d1 | 744 | for (i = 1; i <= p2; i++) |
1066a534 | 745 | Z[i] = Z[i + 1]; |
4709fe76 | 746 | e--; |
44e0d4c2 | 747 | } |
e4d82761 | 748 | |
4709fe76 | 749 | EZ = e; |
e4d82761 | 750 | Z[0] = X[0] * Y[0]; |
e4d82761 | 751 | } |
82a9811d | 752 | #endif |
e4d82761 | 753 | |
82a9811d | 754 | #ifndef NO__SQR |
d6752ccd SP |
755 | /* Square *X and store result in *Y. X and Y may not overlap. For P in |
756 | [1, 2, 3], the exact result is truncated to P digits. In case P > 3 the | |
757 | error is bounded by 1.001 ULP. This is a faster special case of | |
758 | multiplication. */ | |
759 | void | |
760 | SECTION | |
761 | __sqr (const mp_no *x, mp_no *y, int p) | |
762 | { | |
763 | long i, j, k, ip; | |
6f2e90e7 | 764 | mantissa_store_t yk; |
d6752ccd SP |
765 | |
766 | /* Is z=0? */ | |
5739f705 | 767 | if (__glibc_unlikely (X[0] == 0)) |
d6752ccd | 768 | { |
5739f705 | 769 | Y[0] = 0; |
d6752ccd SP |
770 | return; |
771 | } | |
772 | ||
773 | /* We need not iterate through all X's since it's pointless to | |
774 | multiply zeroes. */ | |
775 | for (ip = p; ip > 0; ip--) | |
5739f705 | 776 | if (X[ip] != 0) |
d6752ccd SP |
777 | break; |
778 | ||
779 | k = (__glibc_unlikely (p < 3)) ? p + p : p + 3; | |
780 | ||
781 | while (k > 2 * ip + 1) | |
5739f705 | 782 | Y[k--] = 0; |
d6752ccd | 783 | |
5739f705 | 784 | yk = 0; |
d6752ccd SP |
785 | |
786 | while (k > p) | |
787 | { | |
6f2e90e7 | 788 | mantissa_store_t yk2 = 0; |
d6752ccd SP |
789 | long lim = k / 2; |
790 | ||
791 | if (k % 2 == 0) | |
6f2e90e7 | 792 | yk += X[lim] * (mantissa_store_t) X[lim]; |
d6752ccd | 793 | |
20cd7fb3 SP |
794 | /* In __mul, this loop (and the one within the next while loop) run |
795 | between a range to calculate the mantissa as follows: | |
796 | ||
797 | Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1] | |
798 | + X[n] * Y[k] | |
799 | ||
800 | For X == Y, we can get away with summing halfway and doubling the | |
801 | result. For cases where the range size is even, the mid-point needs | |
802 | to be added separately (above). */ | |
22af19f9 | 803 | for (i = k - p, j = p; i < j; i++, j--) |
6f2e90e7 | 804 | yk2 += X[i] * (mantissa_store_t) X[j]; |
d6752ccd | 805 | |
5739f705 | 806 | yk += 2 * yk2; |
d6752ccd | 807 | |
6f2e90e7 SP |
808 | DIV_RADIX (yk, Y[k]); |
809 | k--; | |
d6752ccd SP |
810 | } |
811 | ||
812 | while (k > 1) | |
813 | { | |
6f2e90e7 | 814 | mantissa_store_t yk2 = 0; |
d6752ccd SP |
815 | long lim = k / 2; |
816 | ||
817 | if (k % 2 == 0) | |
6f2e90e7 | 818 | yk += X[lim] * (mantissa_store_t) X[lim]; |
d6752ccd | 819 | |
20cd7fb3 | 820 | /* Likewise for this loop. */ |
22af19f9 | 821 | for (i = 1, j = k - 1; i < j; i++, j--) |
6f2e90e7 | 822 | yk2 += X[i] * (mantissa_store_t) X[j]; |
d6752ccd | 823 | |
5739f705 | 824 | yk += 2 * yk2; |
d6752ccd | 825 | |
6f2e90e7 SP |
826 | DIV_RADIX (yk, Y[k]); |
827 | k--; | |
d6752ccd SP |
828 | } |
829 | Y[k] = yk; | |
830 | ||
831 | /* Squares are always positive. */ | |
5739f705 | 832 | Y[0] = 1; |
d6752ccd | 833 | |
4709fe76 SP |
834 | /* Get the exponent sum into an intermediate variable. This is a subtle |
835 | optimization, where given enough registers, all operations on the exponent | |
836 | happen in registers and the result is written out only once into EZ. */ | |
837 | int e = EX * 2; | |
838 | ||
d6752ccd | 839 | /* Is there a carry beyond the most significant digit? */ |
5739f705 | 840 | if (__glibc_unlikely (Y[1] == 0)) |
d6752ccd SP |
841 | { |
842 | for (i = 1; i <= p; i++) | |
843 | Y[i] = Y[i + 1]; | |
4709fe76 | 844 | e--; |
d6752ccd | 845 | } |
4709fe76 SP |
846 | |
847 | EY = e; | |
d6752ccd | 848 | } |
82a9811d | 849 | #endif |
d6752ccd | 850 | |
950c99ca SP |
851 | /* Invert *X and store in *Y. Relative error bound: |
852 | - For P = 2: 1.001 * R ^ (1 - P) | |
853 | - For P = 3: 1.063 * R ^ (1 - P) | |
854 | - For P > 3: 2.001 * R ^ (1 - P) | |
e4d82761 | 855 | |
950c99ca | 856 | *X = 0 is not permissible. */ |
1066a534 | 857 | static void |
31d3cc00 | 858 | SECTION |
1066a534 SP |
859 | __inv (const mp_no *x, mp_no *y, int p) |
860 | { | |
e69804d1 | 861 | long i; |
e4d82761 | 862 | double t; |
1066a534 SP |
863 | mp_no z, w; |
864 | static const int np1[] = | |
865 | { 0, 0, 0, 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, | |
866 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 | |
867 | }; | |
868 | ||
869 | __cpy (x, &z, p); | |
870 | z.e = 0; | |
871 | __mp_dbl (&z, &t, p); | |
5739f705 | 872 | t = 1 / t; |
1066a534 SP |
873 | __dbl_mp (t, y, p); |
874 | EY -= EX; | |
875 | ||
876 | for (i = 0; i < np1[p]; i++) | |
877 | { | |
878 | __cpy (y, &w, p); | |
879 | __mul (x, &w, y, p); | |
107a5bf0 | 880 | __sub (&__mptwo, y, &z, p); |
1066a534 SP |
881 | __mul (&w, &z, y, p); |
882 | } | |
e4d82761 UD |
883 | } |
884 | ||
950c99ca SP |
885 | /* Divide *X by *Y and store result in *Z. X and Y may overlap but not X and Z |
886 | or Y and Z. Relative error bound: | |
887 | - For P = 2: 2.001 * R ^ (1 - P) | |
888 | - For P = 3: 2.063 * R ^ (1 - P) | |
889 | - For P > 3: 3.001 * R ^ (1 - P) | |
e4d82761 | 890 | |
950c99ca | 891 | *X = 0 is not permissible. */ |
31d3cc00 UD |
892 | void |
893 | SECTION | |
1066a534 SP |
894 | __dvd (const mp_no *x, const mp_no *y, mp_no *z, int p) |
895 | { | |
e4d82761 UD |
896 | mp_no w; |
897 | ||
5739f705 SP |
898 | if (X[0] == 0) |
899 | Z[0] = 0; | |
1066a534 SP |
900 | else |
901 | { | |
902 | __inv (y, &w, p); | |
903 | __mul (x, &w, z, p); | |
904 | } | |
e4d82761 | 905 | } |