]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/ieee754/dbl-64/mpa.c
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / ieee754 / dbl-64 / mpa.c
CommitLineData
e4d82761
UD
1/*
2 * IBM Accurate Mathematical Library
aeb25823 3 * written by International Business Machines Corp.
b168057a 4 * Copyright (C) 2001-2015 Free Software Foundation, Inc.
e4d82761
UD
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as published by
cc7375ce 8 * the Free Software Foundation; either version 2.1 of the License, or
e4d82761 9 * (at your option) any later version.
50944bca 10 *
e4d82761
UD
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
c6c6dd48 14 * GNU Lesser General Public License for more details.
e4d82761
UD
15 *
16 * You should have received a copy of the GNU Lesser General Public License
59ba27a6 17 * along with this program; if not, see <http://www.gnu.org/licenses/>.
e4d82761
UD
18 */
19/************************************************************************/
20/* MODULE_NAME: mpa.c */
21/* */
22/* FUNCTIONS: */
23/* mcr */
24/* acr */
e4d82761 25/* cpy */
e4d82761
UD
26/* norm */
27/* denorm */
28/* mp_dbl */
29/* dbl_mp */
30/* add_magnitudes */
31/* sub_magnitudes */
32/* add */
33/* sub */
34/* mul */
35/* inv */
36/* dvd */
37/* */
38/* Arithmetic functions for multiple precision numbers. */
39/* Relative errors are bounded */
40/************************************************************************/
41
42
43#include "endian.h"
44#include "mpa.h"
950c99ca 45#include <sys/param.h>
45f05884 46#include <alloca.h>
31d3cc00
UD
47
48#ifndef SECTION
49# define SECTION
50#endif
51
b76eb5f0 52#ifndef NO__CONST
107a5bf0
JM
53const mp_no __mpone = { 1, { 1.0, 1.0 } };
54const mp_no __mptwo = { 1, { 1.0, 2.0 } };
b76eb5f0
SP
55#endif
56
31d3cc00 57#ifndef NO___ACR
950c99ca
SP
58/* Compare mantissa of two multiple precision numbers regardless of the sign
59 and exponent of the numbers. */
31d3cc00 60static int
1066a534
SP
61mcr (const mp_no *x, const mp_no *y, int p)
62{
e69804d1
SP
63 long i;
64 long p2 = p;
65 for (i = 1; i <= p2; i++)
1066a534
SP
66 {
67 if (X[i] == Y[i])
68 continue;
69 else if (X[i] > Y[i])
70 return 1;
71 else
72 return -1;
73 }
e4d82761
UD
74 return 0;
75}
76
950c99ca 77/* Compare the absolute values of two multiple precision numbers. */
31d3cc00 78int
1066a534
SP
79__acr (const mp_no *x, const mp_no *y, int p)
80{
e69804d1 81 long i;
e4d82761 82
5739f705 83 if (X[0] == 0)
1066a534 84 {
5739f705 85 if (Y[0] == 0)
1066a534
SP
86 i = 0;
87 else
88 i = -1;
89 }
5739f705 90 else if (Y[0] == 0)
1066a534
SP
91 i = 1;
92 else
93 {
94 if (EX > EY)
95 i = 1;
96 else if (EX < EY)
97 i = -1;
98 else
99 i = mcr (x, y, p);
100 }
e4d82761
UD
101
102 return i;
103}
31d3cc00 104#endif
e4d82761 105
af968f62 106#ifndef NO___CPY
950c99ca
SP
107/* Copy multiple precision number X into Y. They could be the same
108 number. */
1066a534
SP
109void
110__cpy (const mp_no *x, mp_no *y, int p)
111{
e69804d1
SP
112 long i;
113
e4d82761 114 EY = EX;
e69804d1 115 for (i = 0; i <= p; i++)
1066a534 116 Y[i] = X[i];
e4d82761 117}
af968f62 118#endif
e4d82761 119
af968f62 120#ifndef NO___MP_DBL
950c99ca
SP
121/* Convert a multiple precision number *X into a double precision
122 number *Y, normalized case (|x| >= 2**(-1022))). */
1066a534
SP
123static void
124norm (const mp_no *x, double *y, int p)
50944bca 125{
c5d5d574 126# define R RADIXI
e69804d1 127 long i;
6f2e90e7
SP
128 double c;
129 mantissa_t a, u, v, z[5];
1066a534
SP
130 if (p < 5)
131 {
132 if (p == 1)
133 c = X[1];
134 else if (p == 2)
135 c = X[1] + R * X[2];
136 else if (p == 3)
137 c = X[1] + R * (X[2] + R * X[3]);
138 else if (p == 4)
139 c = (X[1] + R * X[2]) + R * R * (X[3] + R * X[4]);
e4d82761 140 }
1066a534
SP
141 else
142 {
c5d5d574 143 for (a = 1, z[1] = X[1]; z[1] < TWO23; )
1066a534 144 {
5739f705
SP
145 a *= 2;
146 z[1] *= 2;
1066a534 147 }
e4d82761 148
1066a534
SP
149 for (i = 2; i < 5; i++)
150 {
6f2e90e7
SP
151 mantissa_store_t d, r;
152 d = X[i] * (mantissa_store_t) a;
153 DIV_RADIX (d, r);
154 z[i] = r;
155 z[i - 1] += d;
1066a534
SP
156 }
157
6f2e90e7 158 u = ALIGN_DOWN_TO (z[3], TWO19);
1066a534
SP
159 v = z[3] - u;
160
161 if (v == TWO18)
162 {
5739f705 163 if (z[4] == 0)
1066a534
SP
164 {
165 for (i = 5; i <= p; i++)
166 {
5739f705 167 if (X[i] == 0)
1066a534
SP
168 continue;
169 else
170 {
5739f705 171 z[3] += 1;
1066a534
SP
172 break;
173 }
174 }
175 }
176 else
5739f705 177 z[3] += 1;
1066a534
SP
178 }
179
180 c = (z[1] + R * (z[2] + R * z[3])) / a;
181 }
e4d82761
UD
182
183 c *= X[0];
184
1066a534
SP
185 for (i = 1; i < EX; i++)
186 c *= RADIX;
187 for (i = 1; i > EX; i--)
188 c *= RADIXI;
e4d82761
UD
189
190 *y = c;
c5d5d574 191# undef R
e4d82761
UD
192}
193
950c99ca
SP
194/* Convert a multiple precision number *X into a double precision
195 number *Y, Denormal case (|x| < 2**(-1022))). */
1066a534
SP
196static void
197denorm (const mp_no *x, double *y, int p)
50944bca 198{
e69804d1
SP
199 long i, k;
200 long p2 = p;
6f2e90e7
SP
201 double c;
202 mantissa_t u, z[5];
e4d82761 203
c5d5d574 204# define R RADIXI
1066a534
SP
205 if (EX < -44 || (EX == -44 && X[1] < TWO5))
206 {
5739f705 207 *y = 0;
1066a534
SP
208 return;
209 }
210
e69804d1 211 if (p2 == 1)
1066a534
SP
212 {
213 if (EX == -42)
214 {
215 z[1] = X[1] + TWO10;
5739f705
SP
216 z[2] = 0;
217 z[3] = 0;
1066a534
SP
218 k = 3;
219 }
220 else if (EX == -43)
221 {
222 z[1] = TWO10;
223 z[2] = X[1];
5739f705 224 z[3] = 0;
1066a534
SP
225 k = 2;
226 }
227 else
228 {
229 z[1] = TWO10;
5739f705 230 z[2] = 0;
1066a534
SP
231 z[3] = X[1];
232 k = 1;
233 }
234 }
e69804d1 235 else if (p2 == 2)
1066a534
SP
236 {
237 if (EX == -42)
238 {
239 z[1] = X[1] + TWO10;
240 z[2] = X[2];
5739f705 241 z[3] = 0;
1066a534
SP
242 k = 3;
243 }
244 else if (EX == -43)
245 {
246 z[1] = TWO10;
247 z[2] = X[1];
248 z[3] = X[2];
249 k = 2;
250 }
251 else
252 {
253 z[1] = TWO10;
5739f705 254 z[2] = 0;
1066a534
SP
255 z[3] = X[1];
256 k = 1;
257 }
258 }
259 else
260 {
261 if (EX == -42)
262 {
263 z[1] = X[1] + TWO10;
264 z[2] = X[2];
265 k = 3;
266 }
267 else if (EX == -43)
268 {
269 z[1] = TWO10;
270 z[2] = X[1];
271 k = 2;
272 }
273 else
274 {
275 z[1] = TWO10;
5739f705 276 z[2] = 0;
1066a534
SP
277 k = 1;
278 }
279 z[3] = X[k];
280 }
e4d82761 281
6f2e90e7 282 u = ALIGN_DOWN_TO (z[3], TWO5);
e4d82761 283
1066a534
SP
284 if (u == z[3])
285 {
e69804d1 286 for (i = k + 1; i <= p2; i++)
1066a534 287 {
5739f705 288 if (X[i] == 0)
1066a534
SP
289 continue;
290 else
291 {
5739f705 292 z[3] += 1;
1066a534
SP
293 break;
294 }
295 }
e4d82761 296 }
e4d82761 297
1066a534 298 c = X[0] * ((z[1] + R * (z[2] + R * z[3])) - TWO10);
e4d82761 299
1066a534 300 *y = c * TWOM1032;
c5d5d574 301# undef R
e4d82761
UD
302}
303
950c99ca
SP
304/* Convert multiple precision number *X into double precision number *Y. The
305 result is correctly rounded to the nearest/even. */
1066a534
SP
306void
307__mp_dbl (const mp_no *x, double *y, int p)
308{
5739f705 309 if (X[0] == 0)
1066a534 310 {
5739f705 311 *y = 0;
1066a534
SP
312 return;
313 }
e4d82761 314
18ea052c 315 if (__glibc_likely (EX > -42 || (EX == -42 && X[1] >= TWO10)))
1066a534 316 norm (x, y, p);
18ea052c 317 else
1066a534 318 denorm (x, y, p);
e4d82761 319}
af968f62 320#endif
e4d82761 321
950c99ca
SP
322/* Get the multiple precision equivalent of X into *Y. If the precision is too
323 small, the result is truncated. */
31d3cc00
UD
324void
325SECTION
1066a534
SP
326__dbl_mp (double x, mp_no *y, int p)
327{
e69804d1
SP
328 long i, n;
329 long p2 = p;
e4d82761 330
950c99ca 331 /* Sign. */
5739f705 332 if (x == 0)
1066a534 333 {
5739f705 334 Y[0] = 0;
1066a534
SP
335 return;
336 }
5739f705
SP
337 else if (x > 0)
338 Y[0] = 1;
1066a534
SP
339 else
340 {
5739f705 341 Y[0] = -1;
1066a534
SP
342 x = -x;
343 }
e4d82761 344
950c99ca 345 /* Exponent. */
5739f705 346 for (EY = 1; x >= RADIX; EY += 1)
1066a534 347 x *= RADIXI;
5739f705 348 for (; x < 1; EY -= 1)
1066a534 349 x *= RADIX;
e4d82761 350
950c99ca 351 /* Digits. */
e69804d1 352 n = MIN (p2, 4);
1066a534
SP
353 for (i = 1; i <= n; i++)
354 {
6f2e90e7 355 INTEGER_OF (x, Y[i]);
1066a534
SP
356 x *= RADIX;
357 }
e69804d1 358 for (; i <= p2; i++)
5739f705 359 Y[i] = 0;
e4d82761
UD
360}
361
950c99ca
SP
362/* Add magnitudes of *X and *Y assuming that abs (*X) >= abs (*Y) > 0. The
363 sign of the sum *Z is not changed. X and Y may overlap but not X and Z or
364 Y and Z. No guard digit is used. The result equals the exact sum,
365 truncated. */
31d3cc00
UD
366static void
367SECTION
1066a534
SP
368add_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p)
369{
e69804d1
SP
370 long i, j, k;
371 long p2 = p;
6f2e90e7 372 mantissa_t zk;
e4d82761
UD
373
374 EZ = EX;
375
e69804d1
SP
376 i = p2;
377 j = p2 + EY - EX;
378 k = p2 + 1;
1066a534 379
bdf02814 380 if (__glibc_unlikely (j < 1))
1066a534
SP
381 {
382 __cpy (x, z, p);
383 return;
384 }
bdf02814 385
5739f705 386 zk = 0;
1066a534
SP
387
388 for (; j > 0; i--, j--)
389 {
bdf02814
SP
390 zk += X[i] + Y[j];
391 if (zk >= RADIX)
1066a534 392 {
bdf02814 393 Z[k--] = zk - RADIX;
5739f705 394 zk = 1;
1066a534
SP
395 }
396 else
c5d5d574 397 {
bdf02814 398 Z[k--] = zk;
5739f705 399 zk = 0;
bdf02814 400 }
1066a534
SP
401 }
402
403 for (; i > 0; i--)
404 {
bdf02814
SP
405 zk += X[i];
406 if (zk >= RADIX)
1066a534 407 {
bdf02814 408 Z[k--] = zk - RADIX;
5739f705 409 zk = 1;
1066a534
SP
410 }
411 else
c5d5d574 412 {
bdf02814 413 Z[k--] = zk;
5739f705 414 zk = 0;
bdf02814 415 }
1066a534
SP
416 }
417
5739f705 418 if (zk == 0)
1066a534 419 {
e69804d1 420 for (i = 1; i <= p2; i++)
1066a534
SP
421 Z[i] = Z[i + 1];
422 }
423 else
bdf02814
SP
424 {
425 Z[1] = zk;
5739f705 426 EZ += 1;
bdf02814 427 }
e4d82761
UD
428}
429
950c99ca
SP
430/* Subtract the magnitudes of *X and *Y assuming that abs (*x) > abs (*y) > 0.
431 The sign of the difference *Z is not changed. X and Y may overlap but not X
432 and Z or Y and Z. One guard digit is used. The error is less than one
433 ULP. */
31d3cc00
UD
434static void
435SECTION
1066a534
SP
436sub_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p)
437{
e69804d1
SP
438 long i, j, k;
439 long p2 = p;
6f2e90e7 440 mantissa_t zk;
e4d82761
UD
441
442 EZ = EX;
e69804d1
SP
443 i = p2;
444 j = p2 + EY - EX;
445 k = p2;
e4d82761 446
bdf02814
SP
447 /* Y is too small compared to X, copy X over to the result. */
448 if (__glibc_unlikely (j < 1))
1066a534 449 {
bdf02814
SP
450 __cpy (x, z, p);
451 return;
1066a534 452 }
bdf02814
SP
453
454 /* The relevant least significant digit in Y is non-zero, so we factor it in
455 to enhance accuracy. */
5739f705 456 if (j < p2 && Y[j + 1] > 0)
1066a534 457 {
bdf02814 458 Z[k + 1] = RADIX - Y[j + 1];
5739f705 459 zk = -1;
1066a534 460 }
bdf02814 461 else
5739f705 462 zk = Z[k + 1] = 0;
1066a534 463
bdf02814 464 /* Subtract and borrow. */
1066a534
SP
465 for (; j > 0; i--, j--)
466 {
bdf02814 467 zk += (X[i] - Y[j]);
5739f705 468 if (zk < 0)
1066a534 469 {
bdf02814 470 Z[k--] = zk + RADIX;
5739f705 471 zk = -1;
1066a534
SP
472 }
473 else
c5d5d574 474 {
bdf02814 475 Z[k--] = zk;
5739f705 476 zk = 0;
bdf02814 477 }
1066a534
SP
478 }
479
bdf02814 480 /* We're done with digits from Y, so it's just digits in X. */
1066a534
SP
481 for (; i > 0; i--)
482 {
bdf02814 483 zk += X[i];
5739f705 484 if (zk < 0)
1066a534 485 {
bdf02814 486 Z[k--] = zk + RADIX;
5739f705 487 zk = -1;
1066a534
SP
488 }
489 else
c5d5d574 490 {
bdf02814 491 Z[k--] = zk;
5739f705 492 zk = 0;
bdf02814 493 }
1066a534
SP
494 }
495
bdf02814 496 /* Normalize. */
c5d5d574
OB
497 for (i = 1; Z[i] == 0; i++)
498 ;
e4d82761 499 EZ = EZ - i + 1;
c5d5d574 500 for (k = 1; i <= p2 + 1; )
e4d82761 501 Z[k++] = Z[i++];
c5d5d574 502 for (; k <= p2; )
5739f705 503 Z[k++] = 0;
e4d82761
UD
504}
505
950c99ca
SP
506/* Add *X and *Y and store the result in *Z. X and Y may overlap, but not X
507 and Z or Y and Z. One guard digit is used. The error is less than one
508 ULP. */
31d3cc00
UD
509void
510SECTION
1066a534
SP
511__add (const mp_no *x, const mp_no *y, mp_no *z, int p)
512{
e4d82761
UD
513 int n;
514
5739f705 515 if (X[0] == 0)
1066a534
SP
516 {
517 __cpy (y, z, p);
518 return;
519 }
5739f705 520 else if (Y[0] == 0)
1066a534
SP
521 {
522 __cpy (x, z, p);
523 return;
524 }
525
526 if (X[0] == Y[0])
527 {
528 if (__acr (x, y, p) > 0)
529 {
530 add_magnitudes (x, y, z, p);
531 Z[0] = X[0];
532 }
533 else
534 {
535 add_magnitudes (y, x, z, p);
536 Z[0] = Y[0];
537 }
538 }
539 else
540 {
541 if ((n = __acr (x, y, p)) == 1)
542 {
543 sub_magnitudes (x, y, z, p);
544 Z[0] = X[0];
545 }
546 else if (n == -1)
547 {
548 sub_magnitudes (y, x, z, p);
549 Z[0] = Y[0];
550 }
551 else
5739f705 552 Z[0] = 0;
1066a534 553 }
e4d82761
UD
554}
555
950c99ca
SP
556/* Subtract *Y from *X and return the result in *Z. X and Y may overlap but
557 not X and Z or Y and Z. One guard digit is used. The error is less than
558 one ULP. */
31d3cc00
UD
559void
560SECTION
1066a534
SP
561__sub (const mp_no *x, const mp_no *y, mp_no *z, int p)
562{
e4d82761
UD
563 int n;
564
5739f705 565 if (X[0] == 0)
1066a534
SP
566 {
567 __cpy (y, z, p);
568 Z[0] = -Z[0];
569 return;
570 }
5739f705 571 else if (Y[0] == 0)
1066a534
SP
572 {
573 __cpy (x, z, p);
574 return;
575 }
576
577 if (X[0] != Y[0])
578 {
579 if (__acr (x, y, p) > 0)
580 {
581 add_magnitudes (x, y, z, p);
582 Z[0] = X[0];
583 }
584 else
585 {
586 add_magnitudes (y, x, z, p);
587 Z[0] = -Y[0];
588 }
589 }
590 else
591 {
592 if ((n = __acr (x, y, p)) == 1)
593 {
594 sub_magnitudes (x, y, z, p);
595 Z[0] = X[0];
596 }
597 else if (n == -1)
598 {
599 sub_magnitudes (y, x, z, p);
600 Z[0] = -Y[0];
601 }
602 else
5739f705 603 Z[0] = 0;
1066a534 604 }
e4d82761
UD
605}
606
82a9811d 607#ifndef NO__MUL
950c99ca
SP
608/* Multiply *X and *Y and store result in *Z. X and Y may overlap but not X
609 and Z or Y and Z. For P in [1, 2, 3], the exact result is truncated to P
610 digits. In case P > 3 the error is bounded by 1.001 ULP. */
31d3cc00
UD
611void
612SECTION
1066a534
SP
613__mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
614{
e69804d1
SP
615 long i, j, k, ip, ip2;
616 long p2 = p;
6f2e90e7 617 mantissa_store_t zk;
2d0e0f29 618 const mp_no *a;
6f2e90e7 619 mantissa_store_t *diag;
e4d82761 620
44e0d4c2 621 /* Is z=0? */
5739f705 622 if (__glibc_unlikely (X[0] * Y[0] == 0))
44e0d4c2 623 {
5739f705 624 Z[0] = 0;
44e0d4c2
SP
625 return;
626 }
e4d82761 627
909279a5
SP
628 /* We need not iterate through all X's and Y's since it's pointless to
629 multiply zeroes. Here, both are zero... */
e69804d1 630 for (ip2 = p2; ip2 > 0; ip2--)
5739f705 631 if (X[ip2] != 0 || Y[ip2] != 0)
909279a5 632 break;
44e0d4c2 633
5739f705 634 a = X[ip2] != 0 ? y : x;
2d0e0f29 635
909279a5
SP
636 /* ... and here, at least one of them is still zero. */
637 for (ip = ip2; ip > 0; ip--)
5739f705 638 if (a->d[ip] != 0)
909279a5
SP
639 break;
640
641 /* The product looks like this for p = 3 (as an example):
642
643
644 a1 a2 a3
645 x b1 b2 b3
646 -----------------------------
647 a1*b3 a2*b3 a3*b3
648 a1*b2 a2*b2 a3*b2
649 a1*b1 a2*b1 a3*b1
650
651 So our K needs to ideally be P*2, but we're limiting ourselves to P + 3
652 for P >= 3. We compute the above digits in two parts; the last P-1
653 digits and then the first P digits. The last P-1 digits are a sum of
654 products of the input digits from P to P-k where K is 0 for the least
655 significant digit and increases as we go towards the left. The product
656 term is of the form X[k]*X[P-k] as can be seen in the above example.
657
658 The first P digits are also a sum of products with the same product term,
659 except that the sum is from 1 to k. This is also evident from the above
660 example.
661
662 Another thing that becomes evident is that only the most significant
663 ip+ip2 digits of the result are non-zero, where ip and ip2 are the
664 'internal precision' of the input numbers, i.e. digits after ip and ip2
5739f705 665 are all 0. */
909279a5 666
e69804d1 667 k = (__glibc_unlikely (p2 < 3)) ? p2 + p2 : p2 + 3;
909279a5
SP
668
669 while (k > ip + ip2 + 1)
5739f705 670 Z[k--] = 0;
909279a5 671
5739f705 672 zk = 0;
45f05884
SP
673
674 /* Precompute sums of diagonal elements so that we can directly use them
675 later. See the next comment to know we why need them. */
6f2e90e7 676 diag = alloca (k * sizeof (mantissa_store_t));
5739f705 677 mantissa_store_t d = 0;
45f05884
SP
678 for (i = 1; i <= ip; i++)
679 {
6f2e90e7 680 d += X[i] * (mantissa_store_t) Y[i];
45f05884
SP
681 diag[i] = d;
682 }
683 while (i < k)
684 diag[i++] = d;
909279a5 685
e69804d1 686 while (k > p2)
44e0d4c2 687 {
45f05884
SP
688 long lim = k / 2;
689
690 if (k % 2 == 0)
691 /* We want to add this only once, but since we subtract it in the sum
692 of products above, we add twice. */
6f2e90e7 693 zk += 2 * X[lim] * (mantissa_store_t) Y[lim];
45f05884
SP
694
695 for (i = k - p2, j = p2; i < j; i++, j--)
6f2e90e7 696 zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]);
45f05884
SP
697
698 zk -= diag[k - 1];
44e0d4c2 699
6f2e90e7
SP
700 DIV_RADIX (zk, Z[k]);
701 k--;
44e0d4c2
SP
702 }
703
45f05884
SP
704 /* The real deal. Mantissa digit Z[k] is the sum of all X[i] * Y[j] where i
705 goes from 1 -> k - 1 and j goes the same range in reverse. To reduce the
706 number of multiplications, we halve the range and if k is an even number,
707 add the diagonal element X[k/2]Y[k/2]. Through the half range, we compute
708 X[i] * Y[j] as (X[i] + X[j]) * (Y[i] + Y[j]) - X[i] * Y[i] - X[j] * Y[j].
709
710 This reduction tells us that we're summing two things, the first term
711 through the half range and the negative of the sum of the product of all
712 terms of X and Y in the full range. i.e.
713
714 SUM(X[i] * Y[i]) for k terms. This is precalculated above for each k in
715 a single loop so that it completes in O(n) time and can hence be directly
716 used in the loop below. */
44e0d4c2
SP
717 while (k > 1)
718 {
45f05884
SP
719 long lim = k / 2;
720
721 if (k % 2 == 0)
722 /* We want to add this only once, but since we subtract it in the sum
723 of products above, we add twice. */
6f2e90e7 724 zk += 2 * X[lim] * (mantissa_store_t) Y[lim];
45f05884
SP
725
726 for (i = 1, j = k - 1; i < j; i++, j--)
6f2e90e7 727 zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]);
45f05884
SP
728
729 zk -= diag[k - 1];
44e0d4c2 730
6f2e90e7
SP
731 DIV_RADIX (zk, Z[k]);
732 k--;
44e0d4c2 733 }
2a91b573 734 Z[k] = zk;
44e0d4c2 735
4709fe76
SP
736 /* Get the exponent sum into an intermediate variable. This is a subtle
737 optimization, where given enough registers, all operations on the exponent
738 happen in registers and the result is written out only once into EZ. */
739 int e = EX + EY;
740
950c99ca 741 /* Is there a carry beyond the most significant digit? */
5739f705 742 if (__glibc_unlikely (Z[1] == 0))
44e0d4c2 743 {
e69804d1 744 for (i = 1; i <= p2; i++)
1066a534 745 Z[i] = Z[i + 1];
4709fe76 746 e--;
44e0d4c2 747 }
e4d82761 748
4709fe76 749 EZ = e;
e4d82761 750 Z[0] = X[0] * Y[0];
e4d82761 751}
82a9811d 752#endif
e4d82761 753
82a9811d 754#ifndef NO__SQR
d6752ccd
SP
755/* Square *X and store result in *Y. X and Y may not overlap. For P in
756 [1, 2, 3], the exact result is truncated to P digits. In case P > 3 the
757 error is bounded by 1.001 ULP. This is a faster special case of
758 multiplication. */
759void
760SECTION
761__sqr (const mp_no *x, mp_no *y, int p)
762{
763 long i, j, k, ip;
6f2e90e7 764 mantissa_store_t yk;
d6752ccd
SP
765
766 /* Is z=0? */
5739f705 767 if (__glibc_unlikely (X[0] == 0))
d6752ccd 768 {
5739f705 769 Y[0] = 0;
d6752ccd
SP
770 return;
771 }
772
773 /* We need not iterate through all X's since it's pointless to
774 multiply zeroes. */
775 for (ip = p; ip > 0; ip--)
5739f705 776 if (X[ip] != 0)
d6752ccd
SP
777 break;
778
779 k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
780
781 while (k > 2 * ip + 1)
5739f705 782 Y[k--] = 0;
d6752ccd 783
5739f705 784 yk = 0;
d6752ccd
SP
785
786 while (k > p)
787 {
6f2e90e7 788 mantissa_store_t yk2 = 0;
d6752ccd
SP
789 long lim = k / 2;
790
791 if (k % 2 == 0)
6f2e90e7 792 yk += X[lim] * (mantissa_store_t) X[lim];
d6752ccd 793
20cd7fb3
SP
794 /* In __mul, this loop (and the one within the next while loop) run
795 between a range to calculate the mantissa as follows:
796
797 Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1]
798 + X[n] * Y[k]
799
800 For X == Y, we can get away with summing halfway and doubling the
801 result. For cases where the range size is even, the mid-point needs
802 to be added separately (above). */
22af19f9 803 for (i = k - p, j = p; i < j; i++, j--)
6f2e90e7 804 yk2 += X[i] * (mantissa_store_t) X[j];
d6752ccd 805
5739f705 806 yk += 2 * yk2;
d6752ccd 807
6f2e90e7
SP
808 DIV_RADIX (yk, Y[k]);
809 k--;
d6752ccd
SP
810 }
811
812 while (k > 1)
813 {
6f2e90e7 814 mantissa_store_t yk2 = 0;
d6752ccd
SP
815 long lim = k / 2;
816
817 if (k % 2 == 0)
6f2e90e7 818 yk += X[lim] * (mantissa_store_t) X[lim];
d6752ccd 819
20cd7fb3 820 /* Likewise for this loop. */
22af19f9 821 for (i = 1, j = k - 1; i < j; i++, j--)
6f2e90e7 822 yk2 += X[i] * (mantissa_store_t) X[j];
d6752ccd 823
5739f705 824 yk += 2 * yk2;
d6752ccd 825
6f2e90e7
SP
826 DIV_RADIX (yk, Y[k]);
827 k--;
d6752ccd
SP
828 }
829 Y[k] = yk;
830
831 /* Squares are always positive. */
5739f705 832 Y[0] = 1;
d6752ccd 833
4709fe76
SP
834 /* Get the exponent sum into an intermediate variable. This is a subtle
835 optimization, where given enough registers, all operations on the exponent
836 happen in registers and the result is written out only once into EZ. */
837 int e = EX * 2;
838
d6752ccd 839 /* Is there a carry beyond the most significant digit? */
5739f705 840 if (__glibc_unlikely (Y[1] == 0))
d6752ccd
SP
841 {
842 for (i = 1; i <= p; i++)
843 Y[i] = Y[i + 1];
4709fe76 844 e--;
d6752ccd 845 }
4709fe76
SP
846
847 EY = e;
d6752ccd 848}
82a9811d 849#endif
d6752ccd 850
950c99ca
SP
851/* Invert *X and store in *Y. Relative error bound:
852 - For P = 2: 1.001 * R ^ (1 - P)
853 - For P = 3: 1.063 * R ^ (1 - P)
854 - For P > 3: 2.001 * R ^ (1 - P)
e4d82761 855
950c99ca 856 *X = 0 is not permissible. */
1066a534 857static void
31d3cc00 858SECTION
1066a534
SP
859__inv (const mp_no *x, mp_no *y, int p)
860{
e69804d1 861 long i;
e4d82761 862 double t;
1066a534
SP
863 mp_no z, w;
864 static const int np1[] =
865 { 0, 0, 0, 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
866 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
867 };
868
869 __cpy (x, &z, p);
870 z.e = 0;
871 __mp_dbl (&z, &t, p);
5739f705 872 t = 1 / t;
1066a534
SP
873 __dbl_mp (t, y, p);
874 EY -= EX;
875
876 for (i = 0; i < np1[p]; i++)
877 {
878 __cpy (y, &w, p);
879 __mul (x, &w, y, p);
107a5bf0 880 __sub (&__mptwo, y, &z, p);
1066a534
SP
881 __mul (&w, &z, y, p);
882 }
e4d82761
UD
883}
884
950c99ca
SP
885/* Divide *X by *Y and store result in *Z. X and Y may overlap but not X and Z
886 or Y and Z. Relative error bound:
887 - For P = 2: 2.001 * R ^ (1 - P)
888 - For P = 3: 2.063 * R ^ (1 - P)
889 - For P > 3: 3.001 * R ^ (1 - P)
e4d82761 890
950c99ca 891 *X = 0 is not permissible. */
31d3cc00
UD
892void
893SECTION
1066a534
SP
894__dvd (const mp_no *x, const mp_no *y, mp_no *z, int p)
895{
e4d82761
UD
896 mp_no w;
897
5739f705
SP
898 if (X[0] == 0)
899 Z[0] = 0;
1066a534
SP
900 else
901 {
902 __inv (y, &w, p);
903 __mul (x, &w, z, p);
904 }
e4d82761 905}