]>
Commit | Line | Data |
---|---|---|
e4d82761 UD |
1 | /* |
2 | * IBM Accurate Mathematical Library | |
aeb25823 | 3 | * written by International Business Machines Corp. |
568035b7 | 4 | * Copyright (C) 2001-2013 Free Software Foundation, Inc. |
e4d82761 UD |
5 | * |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU Lesser General Public License as published by | |
cc7375ce | 8 | * the Free Software Foundation; either version 2.1 of the License, or |
e4d82761 | 9 | * (at your option) any later version. |
50944bca | 10 | * |
e4d82761 UD |
11 | * This program is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
c6c6dd48 | 14 | * GNU Lesser General Public License for more details. |
e4d82761 UD |
15 | * |
16 | * You should have received a copy of the GNU Lesser General Public License | |
59ba27a6 | 17 | * along with this program; if not, see <http://www.gnu.org/licenses/>. |
e4d82761 UD |
18 | */ |
19 | /************************************************************************/ | |
20 | /* MODULE_NAME: mpa.c */ | |
21 | /* */ | |
22 | /* FUNCTIONS: */ | |
23 | /* mcr */ | |
24 | /* acr */ | |
e4d82761 | 25 | /* cpy */ |
e4d82761 UD |
26 | /* norm */ |
27 | /* denorm */ | |
28 | /* mp_dbl */ | |
29 | /* dbl_mp */ | |
30 | /* add_magnitudes */ | |
31 | /* sub_magnitudes */ | |
32 | /* add */ | |
33 | /* sub */ | |
34 | /* mul */ | |
35 | /* inv */ | |
36 | /* dvd */ | |
37 | /* */ | |
38 | /* Arithmetic functions for multiple precision numbers. */ | |
39 | /* Relative errors are bounded */ | |
40 | /************************************************************************/ | |
41 | ||
42 | ||
43 | #include "endian.h" | |
44 | #include "mpa.h" | |
950c99ca | 45 | #include <sys/param.h> |
31d3cc00 UD |
46 | |
47 | #ifndef SECTION | |
48 | # define SECTION | |
49 | #endif | |
50 | ||
b76eb5f0 SP |
51 | #ifndef NO__CONST |
52 | const mp_no mpone = {1, {1.0, 1.0}}; | |
da08f647 | 53 | const mp_no mptwo = {1, {1.0, 2.0}}; |
b76eb5f0 SP |
54 | #endif |
55 | ||
31d3cc00 | 56 | #ifndef NO___ACR |
950c99ca SP |
57 | /* Compare mantissa of two multiple precision numbers regardless of the sign |
58 | and exponent of the numbers. */ | |
31d3cc00 | 59 | static int |
1066a534 SP |
60 | mcr (const mp_no *x, const mp_no *y, int p) |
61 | { | |
e4d82761 | 62 | int i; |
1066a534 SP |
63 | for (i = 1; i <= p; i++) |
64 | { | |
65 | if (X[i] == Y[i]) | |
66 | continue; | |
67 | else if (X[i] > Y[i]) | |
68 | return 1; | |
69 | else | |
70 | return -1; | |
71 | } | |
e4d82761 UD |
72 | return 0; |
73 | } | |
74 | ||
950c99ca | 75 | /* Compare the absolute values of two multiple precision numbers. */ |
31d3cc00 | 76 | int |
1066a534 SP |
77 | __acr (const mp_no *x, const mp_no *y, int p) |
78 | { | |
e4d82761 UD |
79 | int i; |
80 | ||
1066a534 SP |
81 | if (X[0] == ZERO) |
82 | { | |
83 | if (Y[0] == ZERO) | |
84 | i = 0; | |
85 | else | |
86 | i = -1; | |
87 | } | |
88 | else if (Y[0] == ZERO) | |
89 | i = 1; | |
90 | else | |
91 | { | |
92 | if (EX > EY) | |
93 | i = 1; | |
94 | else if (EX < EY) | |
95 | i = -1; | |
96 | else | |
97 | i = mcr (x, y, p); | |
98 | } | |
e4d82761 UD |
99 | |
100 | return i; | |
101 | } | |
31d3cc00 | 102 | #endif |
e4d82761 | 103 | |
af968f62 | 104 | #ifndef NO___CPY |
950c99ca SP |
105 | /* Copy multiple precision number X into Y. They could be the same |
106 | number. */ | |
1066a534 SP |
107 | void |
108 | __cpy (const mp_no *x, mp_no *y, int p) | |
109 | { | |
e4d82761 | 110 | EY = EX; |
1066a534 SP |
111 | for (int i = 0; i <= p; i++) |
112 | Y[i] = X[i]; | |
e4d82761 | 113 | } |
af968f62 | 114 | #endif |
e4d82761 | 115 | |
af968f62 | 116 | #ifndef NO___MP_DBL |
950c99ca SP |
117 | /* Convert a multiple precision number *X into a double precision |
118 | number *Y, normalized case (|x| >= 2**(-1022))). */ | |
1066a534 SP |
119 | static void |
120 | norm (const mp_no *x, double *y, int p) | |
50944bca | 121 | { |
1066a534 | 122 | #define R RADIXI |
50944bca | 123 | int i; |
1066a534 SP |
124 | double a, c, u, v, z[5]; |
125 | if (p < 5) | |
126 | { | |
127 | if (p == 1) | |
128 | c = X[1]; | |
129 | else if (p == 2) | |
130 | c = X[1] + R * X[2]; | |
131 | else if (p == 3) | |
132 | c = X[1] + R * (X[2] + R * X[3]); | |
133 | else if (p == 4) | |
134 | c = (X[1] + R * X[2]) + R * R * (X[3] + R * X[4]); | |
e4d82761 | 135 | } |
1066a534 SP |
136 | else |
137 | { | |
138 | for (a = ONE, z[1] = X[1]; z[1] < TWO23;) | |
139 | { | |
140 | a *= TWO; | |
141 | z[1] *= TWO; | |
142 | } | |
e4d82761 | 143 | |
1066a534 SP |
144 | for (i = 2; i < 5; i++) |
145 | { | |
146 | z[i] = X[i] * a; | |
147 | u = (z[i] + CUTTER) - CUTTER; | |
148 | if (u > z[i]) | |
149 | u -= RADIX; | |
150 | z[i] -= u; | |
151 | z[i - 1] += u * RADIXI; | |
152 | } | |
153 | ||
154 | u = (z[3] + TWO71) - TWO71; | |
155 | if (u > z[3]) | |
156 | u -= TWO19; | |
157 | v = z[3] - u; | |
158 | ||
159 | if (v == TWO18) | |
160 | { | |
161 | if (z[4] == ZERO) | |
162 | { | |
163 | for (i = 5; i <= p; i++) | |
164 | { | |
165 | if (X[i] == ZERO) | |
166 | continue; | |
167 | else | |
168 | { | |
169 | z[3] += ONE; | |
170 | break; | |
171 | } | |
172 | } | |
173 | } | |
174 | else | |
175 | z[3] += ONE; | |
176 | } | |
177 | ||
178 | c = (z[1] + R * (z[2] + R * z[3])) / a; | |
179 | } | |
e4d82761 UD |
180 | |
181 | c *= X[0]; | |
182 | ||
1066a534 SP |
183 | for (i = 1; i < EX; i++) |
184 | c *= RADIX; | |
185 | for (i = 1; i > EX; i--) | |
186 | c *= RADIXI; | |
e4d82761 UD |
187 | |
188 | *y = c; | |
e4d82761 UD |
189 | #undef R |
190 | } | |
191 | ||
950c99ca SP |
192 | /* Convert a multiple precision number *X into a double precision |
193 | number *Y, Denormal case (|x| < 2**(-1022))). */ | |
1066a534 SP |
194 | static void |
195 | denorm (const mp_no *x, double *y, int p) | |
50944bca | 196 | { |
1066a534 SP |
197 | int i, k; |
198 | double c, u, z[5]; | |
e4d82761 | 199 | |
99136f82 | 200 | #define R RADIXI |
1066a534 SP |
201 | if (EX < -44 || (EX == -44 && X[1] < TWO5)) |
202 | { | |
203 | *y = ZERO; | |
204 | return; | |
205 | } | |
206 | ||
207 | if (p == 1) | |
208 | { | |
209 | if (EX == -42) | |
210 | { | |
211 | z[1] = X[1] + TWO10; | |
212 | z[2] = ZERO; | |
213 | z[3] = ZERO; | |
214 | k = 3; | |
215 | } | |
216 | else if (EX == -43) | |
217 | { | |
218 | z[1] = TWO10; | |
219 | z[2] = X[1]; | |
220 | z[3] = ZERO; | |
221 | k = 2; | |
222 | } | |
223 | else | |
224 | { | |
225 | z[1] = TWO10; | |
226 | z[2] = ZERO; | |
227 | z[3] = X[1]; | |
228 | k = 1; | |
229 | } | |
230 | } | |
231 | else if (p == 2) | |
232 | { | |
233 | if (EX == -42) | |
234 | { | |
235 | z[1] = X[1] + TWO10; | |
236 | z[2] = X[2]; | |
237 | z[3] = ZERO; | |
238 | k = 3; | |
239 | } | |
240 | else if (EX == -43) | |
241 | { | |
242 | z[1] = TWO10; | |
243 | z[2] = X[1]; | |
244 | z[3] = X[2]; | |
245 | k = 2; | |
246 | } | |
247 | else | |
248 | { | |
249 | z[1] = TWO10; | |
250 | z[2] = ZERO; | |
251 | z[3] = X[1]; | |
252 | k = 1; | |
253 | } | |
254 | } | |
255 | else | |
256 | { | |
257 | if (EX == -42) | |
258 | { | |
259 | z[1] = X[1] + TWO10; | |
260 | z[2] = X[2]; | |
261 | k = 3; | |
262 | } | |
263 | else if (EX == -43) | |
264 | { | |
265 | z[1] = TWO10; | |
266 | z[2] = X[1]; | |
267 | k = 2; | |
268 | } | |
269 | else | |
270 | { | |
271 | z[1] = TWO10; | |
272 | z[2] = ZERO; | |
273 | k = 1; | |
274 | } | |
275 | z[3] = X[k]; | |
276 | } | |
e4d82761 UD |
277 | |
278 | u = (z[3] + TWO57) - TWO57; | |
1066a534 SP |
279 | if (u > z[3]) |
280 | u -= TWO5; | |
e4d82761 | 281 | |
1066a534 SP |
282 | if (u == z[3]) |
283 | { | |
284 | for (i = k + 1; i <= p; i++) | |
285 | { | |
286 | if (X[i] == ZERO) | |
287 | continue; | |
288 | else | |
289 | { | |
290 | z[3] += ONE; | |
291 | break; | |
292 | } | |
293 | } | |
e4d82761 | 294 | } |
e4d82761 | 295 | |
1066a534 | 296 | c = X[0] * ((z[1] + R * (z[2] + R * z[3])) - TWO10); |
e4d82761 | 297 | |
1066a534 | 298 | *y = c * TWOM1032; |
e4d82761 UD |
299 | #undef R |
300 | } | |
301 | ||
950c99ca SP |
302 | /* Convert multiple precision number *X into double precision number *Y. The |
303 | result is correctly rounded to the nearest/even. */ | |
1066a534 SP |
304 | void |
305 | __mp_dbl (const mp_no *x, double *y, int p) | |
306 | { | |
307 | if (X[0] == ZERO) | |
308 | { | |
309 | *y = ZERO; | |
310 | return; | |
311 | } | |
e4d82761 | 312 | |
18ea052c | 313 | if (__glibc_likely (EX > -42 || (EX == -42 && X[1] >= TWO10))) |
1066a534 | 314 | norm (x, y, p); |
18ea052c | 315 | else |
1066a534 | 316 | denorm (x, y, p); |
e4d82761 | 317 | } |
af968f62 | 318 | #endif |
e4d82761 | 319 | |
950c99ca SP |
320 | /* Get the multiple precision equivalent of X into *Y. If the precision is too |
321 | small, the result is truncated. */ | |
31d3cc00 UD |
322 | void |
323 | SECTION | |
1066a534 SP |
324 | __dbl_mp (double x, mp_no *y, int p) |
325 | { | |
326 | int i, n; | |
e4d82761 UD |
327 | double u; |
328 | ||
950c99ca | 329 | /* Sign. */ |
1066a534 SP |
330 | if (x == ZERO) |
331 | { | |
332 | Y[0] = ZERO; | |
333 | return; | |
334 | } | |
335 | else if (x > ZERO) | |
336 | Y[0] = ONE; | |
337 | else | |
338 | { | |
339 | Y[0] = MONE; | |
340 | x = -x; | |
341 | } | |
e4d82761 | 342 | |
950c99ca | 343 | /* Exponent. */ |
1066a534 SP |
344 | for (EY = ONE; x >= RADIX; EY += ONE) |
345 | x *= RADIXI; | |
346 | for (; x < ONE; EY -= ONE) | |
347 | x *= RADIX; | |
e4d82761 | 348 | |
950c99ca | 349 | /* Digits. */ |
1066a534 SP |
350 | n = MIN (p, 4); |
351 | for (i = 1; i <= n; i++) | |
352 | { | |
353 | u = (x + TWO52) - TWO52; | |
354 | if (u > x) | |
355 | u -= ONE; | |
356 | Y[i] = u; | |
357 | x -= u; | |
358 | x *= RADIX; | |
359 | } | |
360 | for (; i <= p; i++) | |
361 | Y[i] = ZERO; | |
e4d82761 UD |
362 | } |
363 | ||
950c99ca SP |
364 | /* Add magnitudes of *X and *Y assuming that abs (*X) >= abs (*Y) > 0. The |
365 | sign of the sum *Z is not changed. X and Y may overlap but not X and Z or | |
366 | Y and Z. No guard digit is used. The result equals the exact sum, | |
367 | truncated. */ | |
31d3cc00 UD |
368 | static void |
369 | SECTION | |
1066a534 SP |
370 | add_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p) |
371 | { | |
372 | int i, j, k; | |
bdf02814 | 373 | double zk; |
e4d82761 UD |
374 | |
375 | EZ = EX; | |
376 | ||
1066a534 SP |
377 | i = p; |
378 | j = p + EY - EX; | |
379 | k = p + 1; | |
380 | ||
bdf02814 | 381 | if (__glibc_unlikely (j < 1)) |
1066a534 SP |
382 | { |
383 | __cpy (x, z, p); | |
384 | return; | |
385 | } | |
bdf02814 SP |
386 | |
387 | zk = ZERO; | |
1066a534 SP |
388 | |
389 | for (; j > 0; i--, j--) | |
390 | { | |
bdf02814 SP |
391 | zk += X[i] + Y[j]; |
392 | if (zk >= RADIX) | |
1066a534 | 393 | { |
bdf02814 SP |
394 | Z[k--] = zk - RADIX; |
395 | zk = ONE; | |
1066a534 SP |
396 | } |
397 | else | |
bdf02814 SP |
398 | { |
399 | Z[k--] = zk; | |
400 | zk = ZERO; | |
401 | } | |
1066a534 SP |
402 | } |
403 | ||
404 | for (; i > 0; i--) | |
405 | { | |
bdf02814 SP |
406 | zk += X[i]; |
407 | if (zk >= RADIX) | |
1066a534 | 408 | { |
bdf02814 SP |
409 | Z[k--] = zk - RADIX; |
410 | zk = ONE; | |
1066a534 SP |
411 | } |
412 | else | |
bdf02814 SP |
413 | { |
414 | Z[k--] = zk; | |
415 | zk = ZERO; | |
416 | } | |
1066a534 SP |
417 | } |
418 | ||
bdf02814 | 419 | if (zk == ZERO) |
1066a534 SP |
420 | { |
421 | for (i = 1; i <= p; i++) | |
422 | Z[i] = Z[i + 1]; | |
423 | } | |
424 | else | |
bdf02814 SP |
425 | { |
426 | Z[1] = zk; | |
427 | EZ += ONE; | |
428 | } | |
e4d82761 UD |
429 | } |
430 | ||
950c99ca SP |
431 | /* Subtract the magnitudes of *X and *Y assuming that abs (*x) > abs (*y) > 0. |
432 | The sign of the difference *Z is not changed. X and Y may overlap but not X | |
433 | and Z or Y and Z. One guard digit is used. The error is less than one | |
434 | ULP. */ | |
31d3cc00 UD |
435 | static void |
436 | SECTION | |
1066a534 SP |
437 | sub_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p) |
438 | { | |
439 | int i, j, k; | |
bdf02814 | 440 | double zk; |
e4d82761 UD |
441 | |
442 | EZ = EX; | |
bdf02814 SP |
443 | i = p; |
444 | j = p + EY - EX; | |
445 | k = p; | |
e4d82761 | 446 | |
bdf02814 SP |
447 | /* Y is too small compared to X, copy X over to the result. */ |
448 | if (__glibc_unlikely (j < 1)) | |
1066a534 | 449 | { |
bdf02814 SP |
450 | __cpy (x, z, p); |
451 | return; | |
1066a534 | 452 | } |
bdf02814 SP |
453 | |
454 | /* The relevant least significant digit in Y is non-zero, so we factor it in | |
455 | to enhance accuracy. */ | |
456 | if (j < p && Y[j + 1] > ZERO) | |
1066a534 | 457 | { |
bdf02814 SP |
458 | Z[k + 1] = RADIX - Y[j + 1]; |
459 | zk = MONE; | |
1066a534 | 460 | } |
bdf02814 SP |
461 | else |
462 | zk = Z[k + 1] = ZERO; | |
1066a534 | 463 | |
bdf02814 | 464 | /* Subtract and borrow. */ |
1066a534 SP |
465 | for (; j > 0; i--, j--) |
466 | { | |
bdf02814 SP |
467 | zk += (X[i] - Y[j]); |
468 | if (zk < ZERO) | |
1066a534 | 469 | { |
bdf02814 SP |
470 | Z[k--] = zk + RADIX; |
471 | zk = MONE; | |
1066a534 SP |
472 | } |
473 | else | |
bdf02814 SP |
474 | { |
475 | Z[k--] = zk; | |
476 | zk = ZERO; | |
477 | } | |
1066a534 SP |
478 | } |
479 | ||
bdf02814 | 480 | /* We're done with digits from Y, so it's just digits in X. */ |
1066a534 SP |
481 | for (; i > 0; i--) |
482 | { | |
bdf02814 SP |
483 | zk += X[i]; |
484 | if (zk < ZERO) | |
1066a534 | 485 | { |
bdf02814 SP |
486 | Z[k--] = zk + RADIX; |
487 | zk = MONE; | |
1066a534 SP |
488 | } |
489 | else | |
bdf02814 SP |
490 | { |
491 | Z[k--] = zk; | |
492 | zk = ZERO; | |
493 | } | |
1066a534 SP |
494 | } |
495 | ||
bdf02814 | 496 | /* Normalize. */ |
1066a534 | 497 | for (i = 1; Z[i] == ZERO; i++); |
e4d82761 | 498 | EZ = EZ - i + 1; |
1066a534 | 499 | for (k = 1; i <= p + 1;) |
e4d82761 | 500 | Z[k++] = Z[i++]; |
1066a534 | 501 | for (; k <= p;) |
e4d82761 | 502 | Z[k++] = ZERO; |
e4d82761 UD |
503 | } |
504 | ||
950c99ca SP |
505 | /* Add *X and *Y and store the result in *Z. X and Y may overlap, but not X |
506 | and Z or Y and Z. One guard digit is used. The error is less than one | |
507 | ULP. */ | |
31d3cc00 UD |
508 | void |
509 | SECTION | |
1066a534 SP |
510 | __add (const mp_no *x, const mp_no *y, mp_no *z, int p) |
511 | { | |
e4d82761 UD |
512 | int n; |
513 | ||
1066a534 SP |
514 | if (X[0] == ZERO) |
515 | { | |
516 | __cpy (y, z, p); | |
517 | return; | |
518 | } | |
519 | else if (Y[0] == ZERO) | |
520 | { | |
521 | __cpy (x, z, p); | |
522 | return; | |
523 | } | |
524 | ||
525 | if (X[0] == Y[0]) | |
526 | { | |
527 | if (__acr (x, y, p) > 0) | |
528 | { | |
529 | add_magnitudes (x, y, z, p); | |
530 | Z[0] = X[0]; | |
531 | } | |
532 | else | |
533 | { | |
534 | add_magnitudes (y, x, z, p); | |
535 | Z[0] = Y[0]; | |
536 | } | |
537 | } | |
538 | else | |
539 | { | |
540 | if ((n = __acr (x, y, p)) == 1) | |
541 | { | |
542 | sub_magnitudes (x, y, z, p); | |
543 | Z[0] = X[0]; | |
544 | } | |
545 | else if (n == -1) | |
546 | { | |
547 | sub_magnitudes (y, x, z, p); | |
548 | Z[0] = Y[0]; | |
549 | } | |
550 | else | |
551 | Z[0] = ZERO; | |
552 | } | |
e4d82761 UD |
553 | } |
554 | ||
950c99ca SP |
555 | /* Subtract *Y from *X and return the result in *Z. X and Y may overlap but |
556 | not X and Z or Y and Z. One guard digit is used. The error is less than | |
557 | one ULP. */ | |
31d3cc00 UD |
558 | void |
559 | SECTION | |
1066a534 SP |
560 | __sub (const mp_no *x, const mp_no *y, mp_no *z, int p) |
561 | { | |
e4d82761 UD |
562 | int n; |
563 | ||
1066a534 SP |
564 | if (X[0] == ZERO) |
565 | { | |
566 | __cpy (y, z, p); | |
567 | Z[0] = -Z[0]; | |
568 | return; | |
569 | } | |
570 | else if (Y[0] == ZERO) | |
571 | { | |
572 | __cpy (x, z, p); | |
573 | return; | |
574 | } | |
575 | ||
576 | if (X[0] != Y[0]) | |
577 | { | |
578 | if (__acr (x, y, p) > 0) | |
579 | { | |
580 | add_magnitudes (x, y, z, p); | |
581 | Z[0] = X[0]; | |
582 | } | |
583 | else | |
584 | { | |
585 | add_magnitudes (y, x, z, p); | |
586 | Z[0] = -Y[0]; | |
587 | } | |
588 | } | |
589 | else | |
590 | { | |
591 | if ((n = __acr (x, y, p)) == 1) | |
592 | { | |
593 | sub_magnitudes (x, y, z, p); | |
594 | Z[0] = X[0]; | |
595 | } | |
596 | else if (n == -1) | |
597 | { | |
598 | sub_magnitudes (y, x, z, p); | |
599 | Z[0] = -Y[0]; | |
600 | } | |
601 | else | |
602 | Z[0] = ZERO; | |
603 | } | |
e4d82761 UD |
604 | } |
605 | ||
950c99ca SP |
606 | /* Multiply *X and *Y and store result in *Z. X and Y may overlap but not X |
607 | and Z or Y and Z. For P in [1, 2, 3], the exact result is truncated to P | |
608 | digits. In case P > 3 the error is bounded by 1.001 ULP. */ | |
31d3cc00 UD |
609 | void |
610 | SECTION | |
1066a534 SP |
611 | __mul (const mp_no *x, const mp_no *y, mp_no *z, int p) |
612 | { | |
909279a5 | 613 | int i, j, k, ip, ip2; |
2a91b573 | 614 | double u, zk; |
2d0e0f29 | 615 | const mp_no *a; |
e4d82761 | 616 | |
44e0d4c2 SP |
617 | /* Is z=0? */ |
618 | if (__glibc_unlikely (X[0] * Y[0] == ZERO)) | |
619 | { | |
1066a534 | 620 | Z[0] = ZERO; |
44e0d4c2 SP |
621 | return; |
622 | } | |
e4d82761 | 623 | |
909279a5 SP |
624 | /* We need not iterate through all X's and Y's since it's pointless to |
625 | multiply zeroes. Here, both are zero... */ | |
626 | for (ip2 = p; ip2 > 0; ip2--) | |
627 | if (X[ip2] != ZERO || Y[ip2] != ZERO) | |
628 | break; | |
44e0d4c2 | 629 | |
2d0e0f29 SP |
630 | a = X[ip2] != ZERO ? y : x; |
631 | ||
909279a5 SP |
632 | /* ... and here, at least one of them is still zero. */ |
633 | for (ip = ip2; ip > 0; ip--) | |
2d0e0f29 | 634 | if (a->d[ip] != ZERO) |
909279a5 SP |
635 | break; |
636 | ||
637 | /* The product looks like this for p = 3 (as an example): | |
638 | ||
639 | ||
640 | a1 a2 a3 | |
641 | x b1 b2 b3 | |
642 | ----------------------------- | |
643 | a1*b3 a2*b3 a3*b3 | |
644 | a1*b2 a2*b2 a3*b2 | |
645 | a1*b1 a2*b1 a3*b1 | |
646 | ||
647 | So our K needs to ideally be P*2, but we're limiting ourselves to P + 3 | |
648 | for P >= 3. We compute the above digits in two parts; the last P-1 | |
649 | digits and then the first P digits. The last P-1 digits are a sum of | |
650 | products of the input digits from P to P-k where K is 0 for the least | |
651 | significant digit and increases as we go towards the left. The product | |
652 | term is of the form X[k]*X[P-k] as can be seen in the above example. | |
653 | ||
654 | The first P digits are also a sum of products with the same product term, | |
655 | except that the sum is from 1 to k. This is also evident from the above | |
656 | example. | |
657 | ||
658 | Another thing that becomes evident is that only the most significant | |
659 | ip+ip2 digits of the result are non-zero, where ip and ip2 are the | |
660 | 'internal precision' of the input numbers, i.e. digits after ip and ip2 | |
661 | are all ZERO. */ | |
662 | ||
663 | k = (__glibc_unlikely (p < 3)) ? p + p : p + 3; | |
664 | ||
665 | while (k > ip + ip2 + 1) | |
666 | Z[k--] = ZERO; | |
667 | ||
668 | zk = Z[k] = ZERO; | |
669 | ||
670 | while (k > p) | |
44e0d4c2 SP |
671 | { |
672 | for (i = k - p, j = p; i < p + 1; i++, j--) | |
2a91b573 | 673 | zk += X[i] * Y[j]; |
44e0d4c2 | 674 | |
2a91b573 SP |
675 | u = (zk + CUTTER) - CUTTER; |
676 | if (u > zk) | |
44e0d4c2 | 677 | u -= RADIX; |
909279a5 | 678 | Z[k--] = zk - u; |
2a91b573 | 679 | zk = u * RADIXI; |
44e0d4c2 SP |
680 | } |
681 | ||
909279a5 | 682 | /* The real deal. */ |
44e0d4c2 SP |
683 | while (k > 1) |
684 | { | |
1066a534 | 685 | for (i = 1, j = k - 1; i < k; i++, j--) |
2a91b573 | 686 | zk += X[i] * Y[j]; |
44e0d4c2 | 687 | |
2a91b573 SP |
688 | u = (zk + CUTTER) - CUTTER; |
689 | if (u > zk) | |
44e0d4c2 | 690 | u -= RADIX; |
909279a5 | 691 | Z[k--] = zk - u; |
2a91b573 | 692 | zk = u * RADIXI; |
44e0d4c2 | 693 | } |
2a91b573 | 694 | Z[k] = zk; |
44e0d4c2 | 695 | |
4709fe76 SP |
696 | /* Get the exponent sum into an intermediate variable. This is a subtle |
697 | optimization, where given enough registers, all operations on the exponent | |
698 | happen in registers and the result is written out only once into EZ. */ | |
699 | int e = EX + EY; | |
700 | ||
950c99ca | 701 | /* Is there a carry beyond the most significant digit? */ |
44e0d4c2 SP |
702 | if (__glibc_unlikely (Z[1] == ZERO)) |
703 | { | |
704 | for (i = 1; i <= p; i++) | |
1066a534 | 705 | Z[i] = Z[i + 1]; |
4709fe76 | 706 | e--; |
44e0d4c2 | 707 | } |
e4d82761 | 708 | |
4709fe76 | 709 | EZ = e; |
e4d82761 | 710 | Z[0] = X[0] * Y[0]; |
e4d82761 UD |
711 | } |
712 | ||
d6752ccd SP |
713 | /* Square *X and store result in *Y. X and Y may not overlap. For P in |
714 | [1, 2, 3], the exact result is truncated to P digits. In case P > 3 the | |
715 | error is bounded by 1.001 ULP. This is a faster special case of | |
716 | multiplication. */ | |
717 | void | |
718 | SECTION | |
719 | __sqr (const mp_no *x, mp_no *y, int p) | |
720 | { | |
721 | long i, j, k, ip; | |
722 | double u, yk; | |
723 | ||
724 | /* Is z=0? */ | |
725 | if (__glibc_unlikely (X[0] == ZERO)) | |
726 | { | |
727 | Y[0] = ZERO; | |
728 | return; | |
729 | } | |
730 | ||
731 | /* We need not iterate through all X's since it's pointless to | |
732 | multiply zeroes. */ | |
733 | for (ip = p; ip > 0; ip--) | |
734 | if (X[ip] != ZERO) | |
735 | break; | |
736 | ||
737 | k = (__glibc_unlikely (p < 3)) ? p + p : p + 3; | |
738 | ||
739 | while (k > 2 * ip + 1) | |
740 | Y[k--] = ZERO; | |
741 | ||
742 | yk = ZERO; | |
743 | ||
744 | while (k > p) | |
745 | { | |
746 | double yk2 = 0.0; | |
747 | long lim = k / 2; | |
748 | ||
749 | if (k % 2 == 0) | |
750 | { | |
751 | yk += X[lim] * X[lim]; | |
752 | lim--; | |
753 | } | |
754 | ||
755 | for (i = k - p, j = p; i <= lim; i++, j--) | |
756 | yk2 += X[i] * X[j]; | |
757 | ||
758 | yk += 2.0 * yk2; | |
759 | ||
760 | u = (yk + CUTTER) - CUTTER; | |
761 | if (u > yk) | |
762 | u -= RADIX; | |
763 | Y[k--] = yk - u; | |
764 | yk = u * RADIXI; | |
765 | } | |
766 | ||
767 | while (k > 1) | |
768 | { | |
769 | double yk2 = 0.0; | |
770 | long lim = k / 2; | |
771 | ||
772 | if (k % 2 == 0) | |
773 | { | |
774 | yk += X[lim] * X[lim]; | |
775 | lim--; | |
776 | } | |
777 | ||
778 | for (i = 1, j = k - 1; i <= lim; i++, j--) | |
779 | yk2 += X[i] * X[j]; | |
780 | ||
781 | yk += 2.0 * yk2; | |
782 | ||
783 | u = (yk + CUTTER) - CUTTER; | |
784 | if (u > yk) | |
785 | u -= RADIX; | |
786 | Y[k--] = yk - u; | |
787 | yk = u * RADIXI; | |
788 | } | |
789 | Y[k] = yk; | |
790 | ||
791 | /* Squares are always positive. */ | |
792 | Y[0] = 1.0; | |
793 | ||
4709fe76 SP |
794 | /* Get the exponent sum into an intermediate variable. This is a subtle |
795 | optimization, where given enough registers, all operations on the exponent | |
796 | happen in registers and the result is written out only once into EZ. */ | |
797 | int e = EX * 2; | |
798 | ||
d6752ccd SP |
799 | /* Is there a carry beyond the most significant digit? */ |
800 | if (__glibc_unlikely (Y[1] == ZERO)) | |
801 | { | |
802 | for (i = 1; i <= p; i++) | |
803 | Y[i] = Y[i + 1]; | |
4709fe76 | 804 | e--; |
d6752ccd | 805 | } |
4709fe76 SP |
806 | |
807 | EY = e; | |
d6752ccd SP |
808 | } |
809 | ||
950c99ca SP |
810 | /* Invert *X and store in *Y. Relative error bound: |
811 | - For P = 2: 1.001 * R ^ (1 - P) | |
812 | - For P = 3: 1.063 * R ^ (1 - P) | |
813 | - For P > 3: 2.001 * R ^ (1 - P) | |
e4d82761 | 814 | |
950c99ca | 815 | *X = 0 is not permissible. */ |
1066a534 | 816 | static void |
31d3cc00 | 817 | SECTION |
1066a534 SP |
818 | __inv (const mp_no *x, mp_no *y, int p) |
819 | { | |
50944bca | 820 | int i; |
e4d82761 | 821 | double t; |
1066a534 SP |
822 | mp_no z, w; |
823 | static const int np1[] = | |
824 | { 0, 0, 0, 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, | |
825 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 | |
826 | }; | |
827 | ||
828 | __cpy (x, &z, p); | |
829 | z.e = 0; | |
830 | __mp_dbl (&z, &t, p); | |
831 | t = ONE / t; | |
832 | __dbl_mp (t, y, p); | |
833 | EY -= EX; | |
834 | ||
835 | for (i = 0; i < np1[p]; i++) | |
836 | { | |
837 | __cpy (y, &w, p); | |
838 | __mul (x, &w, y, p); | |
839 | __sub (&mptwo, y, &z, p); | |
840 | __mul (&w, &z, y, p); | |
841 | } | |
e4d82761 UD |
842 | } |
843 | ||
950c99ca SP |
844 | /* Divide *X by *Y and store result in *Z. X and Y may overlap but not X and Z |
845 | or Y and Z. Relative error bound: | |
846 | - For P = 2: 2.001 * R ^ (1 - P) | |
847 | - For P = 3: 2.063 * R ^ (1 - P) | |
848 | - For P > 3: 3.001 * R ^ (1 - P) | |
e4d82761 | 849 | |
950c99ca | 850 | *X = 0 is not permissible. */ |
31d3cc00 UD |
851 | void |
852 | SECTION | |
1066a534 SP |
853 | __dvd (const mp_no *x, const mp_no *y, mp_no *z, int p) |
854 | { | |
e4d82761 UD |
855 | mp_no w; |
856 | ||
1066a534 SP |
857 | if (X[0] == ZERO) |
858 | Z[0] = ZERO; | |
859 | else | |
860 | { | |
861 | __inv (y, &w, p); | |
862 | __mul (x, &w, z, p); | |
863 | } | |
e4d82761 | 864 | } |