]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/ieee754/dbl-64/mpa.c
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / ieee754 / dbl-64 / mpa.c
CommitLineData
e4d82761
UD
1/*
2 * IBM Accurate Mathematical Library
aeb25823 3 * written by International Business Machines Corp.
d614a753 4 * Copyright (C) 2001-2020 Free Software Foundation, Inc.
e4d82761
UD
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as published by
cc7375ce 8 * the Free Software Foundation; either version 2.1 of the License, or
e4d82761 9 * (at your option) any later version.
50944bca 10 *
e4d82761
UD
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
c6c6dd48 14 * GNU Lesser General Public License for more details.
e4d82761
UD
15 *
16 * You should have received a copy of the GNU Lesser General Public License
5a82c748 17 * along with this program; if not, see <https://www.gnu.org/licenses/>.
e4d82761
UD
18 */
19/************************************************************************/
20/* MODULE_NAME: mpa.c */
21/* */
22/* FUNCTIONS: */
23/* mcr */
24/* acr */
e4d82761 25/* cpy */
e4d82761
UD
26/* norm */
27/* denorm */
28/* mp_dbl */
29/* dbl_mp */
30/* add_magnitudes */
31/* sub_magnitudes */
32/* add */
33/* sub */
34/* mul */
35/* inv */
36/* dvd */
37/* */
38/* Arithmetic functions for multiple precision numbers. */
39/* Relative errors are bounded */
40/************************************************************************/
41
42
43#include "endian.h"
44#include "mpa.h"
950c99ca 45#include <sys/param.h>
45f05884 46#include <alloca.h>
31d3cc00
UD
47
48#ifndef SECTION
49# define SECTION
50#endif
51
b76eb5f0 52#ifndef NO__CONST
107a5bf0
JM
53const mp_no __mpone = { 1, { 1.0, 1.0 } };
54const mp_no __mptwo = { 1, { 1.0, 2.0 } };
b76eb5f0
SP
55#endif
56
31d3cc00 57#ifndef NO___ACR
950c99ca
SP
58/* Compare mantissa of two multiple precision numbers regardless of the sign
59 and exponent of the numbers. */
31d3cc00 60static int
1066a534
SP
61mcr (const mp_no *x, const mp_no *y, int p)
62{
e69804d1
SP
63 long i;
64 long p2 = p;
65 for (i = 1; i <= p2; i++)
1066a534
SP
66 {
67 if (X[i] == Y[i])
68 continue;
69 else if (X[i] > Y[i])
70 return 1;
71 else
72 return -1;
73 }
e4d82761
UD
74 return 0;
75}
76
950c99ca 77/* Compare the absolute values of two multiple precision numbers. */
31d3cc00 78int
1066a534
SP
79__acr (const mp_no *x, const mp_no *y, int p)
80{
e69804d1 81 long i;
e4d82761 82
5739f705 83 if (X[0] == 0)
1066a534 84 {
5739f705 85 if (Y[0] == 0)
1066a534
SP
86 i = 0;
87 else
88 i = -1;
89 }
5739f705 90 else if (Y[0] == 0)
1066a534
SP
91 i = 1;
92 else
93 {
94 if (EX > EY)
95 i = 1;
96 else if (EX < EY)
97 i = -1;
98 else
99 i = mcr (x, y, p);
100 }
e4d82761
UD
101
102 return i;
103}
31d3cc00 104#endif
e4d82761 105
af968f62 106#ifndef NO___CPY
950c99ca
SP
107/* Copy multiple precision number X into Y. They could be the same
108 number. */
1066a534
SP
109void
110__cpy (const mp_no *x, mp_no *y, int p)
111{
e69804d1
SP
112 long i;
113
e4d82761 114 EY = EX;
e69804d1 115 for (i = 0; i <= p; i++)
1066a534 116 Y[i] = X[i];
e4d82761 117}
af968f62 118#endif
e4d82761 119
af968f62 120#ifndef NO___MP_DBL
950c99ca 121/* Convert a multiple precision number *X into a double precision
89f3b6e1
JM
122 number *Y, normalized case (|x| >= 2**(-1022))). X has precision
123 P, which is positive. */
1066a534
SP
124static void
125norm (const mp_no *x, double *y, int p)
50944bca 126{
c5d5d574 127# define R RADIXI
e69804d1 128 long i;
6f2e90e7
SP
129 double c;
130 mantissa_t a, u, v, z[5];
1066a534
SP
131 if (p < 5)
132 {
133 if (p == 1)
134 c = X[1];
135 else if (p == 2)
136 c = X[1] + R * X[2];
137 else if (p == 3)
138 c = X[1] + R * (X[2] + R * X[3]);
89f3b6e1 139 else /* p == 4. */
1066a534 140 c = (X[1] + R * X[2]) + R * R * (X[3] + R * X[4]);
e4d82761 141 }
1066a534
SP
142 else
143 {
c5d5d574 144 for (a = 1, z[1] = X[1]; z[1] < TWO23; )
1066a534 145 {
5739f705
SP
146 a *= 2;
147 z[1] *= 2;
1066a534 148 }
e4d82761 149
1066a534
SP
150 for (i = 2; i < 5; i++)
151 {
6f2e90e7
SP
152 mantissa_store_t d, r;
153 d = X[i] * (mantissa_store_t) a;
154 DIV_RADIX (d, r);
155 z[i] = r;
156 z[i - 1] += d;
1066a534
SP
157 }
158
6f2e90e7 159 u = ALIGN_DOWN_TO (z[3], TWO19);
1066a534
SP
160 v = z[3] - u;
161
162 if (v == TWO18)
163 {
5739f705 164 if (z[4] == 0)
1066a534
SP
165 {
166 for (i = 5; i <= p; i++)
167 {
5739f705 168 if (X[i] == 0)
1066a534
SP
169 continue;
170 else
171 {
5739f705 172 z[3] += 1;
1066a534
SP
173 break;
174 }
175 }
176 }
177 else
5739f705 178 z[3] += 1;
1066a534
SP
179 }
180
181 c = (z[1] + R * (z[2] + R * z[3])) / a;
182 }
e4d82761
UD
183
184 c *= X[0];
185
1066a534
SP
186 for (i = 1; i < EX; i++)
187 c *= RADIX;
188 for (i = 1; i > EX; i--)
189 c *= RADIXI;
e4d82761
UD
190
191 *y = c;
c5d5d574 192# undef R
e4d82761
UD
193}
194
950c99ca
SP
195/* Convert a multiple precision number *X into a double precision
196 number *Y, Denormal case (|x| < 2**(-1022))). */
1066a534
SP
197static void
198denorm (const mp_no *x, double *y, int p)
50944bca 199{
e69804d1
SP
200 long i, k;
201 long p2 = p;
6f2e90e7
SP
202 double c;
203 mantissa_t u, z[5];
e4d82761 204
c5d5d574 205# define R RADIXI
1066a534
SP
206 if (EX < -44 || (EX == -44 && X[1] < TWO5))
207 {
5739f705 208 *y = 0;
1066a534
SP
209 return;
210 }
211
e69804d1 212 if (p2 == 1)
1066a534
SP
213 {
214 if (EX == -42)
215 {
216 z[1] = X[1] + TWO10;
5739f705
SP
217 z[2] = 0;
218 z[3] = 0;
1066a534
SP
219 k = 3;
220 }
221 else if (EX == -43)
222 {
223 z[1] = TWO10;
224 z[2] = X[1];
5739f705 225 z[3] = 0;
1066a534
SP
226 k = 2;
227 }
228 else
229 {
230 z[1] = TWO10;
5739f705 231 z[2] = 0;
1066a534
SP
232 z[3] = X[1];
233 k = 1;
234 }
235 }
e69804d1 236 else if (p2 == 2)
1066a534
SP
237 {
238 if (EX == -42)
239 {
240 z[1] = X[1] + TWO10;
241 z[2] = X[2];
5739f705 242 z[3] = 0;
1066a534
SP
243 k = 3;
244 }
245 else if (EX == -43)
246 {
247 z[1] = TWO10;
248 z[2] = X[1];
249 z[3] = X[2];
250 k = 2;
251 }
252 else
253 {
254 z[1] = TWO10;
5739f705 255 z[2] = 0;
1066a534
SP
256 z[3] = X[1];
257 k = 1;
258 }
259 }
260 else
261 {
262 if (EX == -42)
263 {
264 z[1] = X[1] + TWO10;
265 z[2] = X[2];
266 k = 3;
267 }
268 else if (EX == -43)
269 {
270 z[1] = TWO10;
271 z[2] = X[1];
272 k = 2;
273 }
274 else
275 {
276 z[1] = TWO10;
5739f705 277 z[2] = 0;
1066a534
SP
278 k = 1;
279 }
280 z[3] = X[k];
281 }
e4d82761 282
6f2e90e7 283 u = ALIGN_DOWN_TO (z[3], TWO5);
e4d82761 284
1066a534
SP
285 if (u == z[3])
286 {
e69804d1 287 for (i = k + 1; i <= p2; i++)
1066a534 288 {
5739f705 289 if (X[i] == 0)
1066a534
SP
290 continue;
291 else
292 {
5739f705 293 z[3] += 1;
1066a534
SP
294 break;
295 }
296 }
e4d82761 297 }
e4d82761 298
1066a534 299 c = X[0] * ((z[1] + R * (z[2] + R * z[3])) - TWO10);
e4d82761 300
1066a534 301 *y = c * TWOM1032;
c5d5d574 302# undef R
e4d82761
UD
303}
304
950c99ca
SP
305/* Convert multiple precision number *X into double precision number *Y. The
306 result is correctly rounded to the nearest/even. */
1066a534
SP
307void
308__mp_dbl (const mp_no *x, double *y, int p)
309{
5739f705 310 if (X[0] == 0)
1066a534 311 {
5739f705 312 *y = 0;
1066a534
SP
313 return;
314 }
e4d82761 315
18ea052c 316 if (__glibc_likely (EX > -42 || (EX == -42 && X[1] >= TWO10)))
1066a534 317 norm (x, y, p);
18ea052c 318 else
1066a534 319 denorm (x, y, p);
e4d82761 320}
af968f62 321#endif
e4d82761 322
950c99ca
SP
323/* Get the multiple precision equivalent of X into *Y. If the precision is too
324 small, the result is truncated. */
31d3cc00
UD
325void
326SECTION
1066a534
SP
327__dbl_mp (double x, mp_no *y, int p)
328{
e69804d1
SP
329 long i, n;
330 long p2 = p;
e4d82761 331
950c99ca 332 /* Sign. */
5739f705 333 if (x == 0)
1066a534 334 {
5739f705 335 Y[0] = 0;
1066a534
SP
336 return;
337 }
5739f705
SP
338 else if (x > 0)
339 Y[0] = 1;
1066a534
SP
340 else
341 {
5739f705 342 Y[0] = -1;
1066a534
SP
343 x = -x;
344 }
e4d82761 345
950c99ca 346 /* Exponent. */
5739f705 347 for (EY = 1; x >= RADIX; EY += 1)
1066a534 348 x *= RADIXI;
5739f705 349 for (; x < 1; EY -= 1)
1066a534 350 x *= RADIX;
e4d82761 351
950c99ca 352 /* Digits. */
e69804d1 353 n = MIN (p2, 4);
1066a534
SP
354 for (i = 1; i <= n; i++)
355 {
6f2e90e7 356 INTEGER_OF (x, Y[i]);
1066a534
SP
357 x *= RADIX;
358 }
e69804d1 359 for (; i <= p2; i++)
5739f705 360 Y[i] = 0;
e4d82761
UD
361}
362
950c99ca
SP
363/* Add magnitudes of *X and *Y assuming that abs (*X) >= abs (*Y) > 0. The
364 sign of the sum *Z is not changed. X and Y may overlap but not X and Z or
365 Y and Z. No guard digit is used. The result equals the exact sum,
366 truncated. */
31d3cc00
UD
367static void
368SECTION
1066a534
SP
369add_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p)
370{
e69804d1
SP
371 long i, j, k;
372 long p2 = p;
6f2e90e7 373 mantissa_t zk;
e4d82761
UD
374
375 EZ = EX;
376
e69804d1
SP
377 i = p2;
378 j = p2 + EY - EX;
379 k = p2 + 1;
1066a534 380
bdf02814 381 if (__glibc_unlikely (j < 1))
1066a534
SP
382 {
383 __cpy (x, z, p);
384 return;
385 }
bdf02814 386
5739f705 387 zk = 0;
1066a534
SP
388
389 for (; j > 0; i--, j--)
390 {
bdf02814
SP
391 zk += X[i] + Y[j];
392 if (zk >= RADIX)
1066a534 393 {
bdf02814 394 Z[k--] = zk - RADIX;
5739f705 395 zk = 1;
1066a534
SP
396 }
397 else
c5d5d574 398 {
bdf02814 399 Z[k--] = zk;
5739f705 400 zk = 0;
bdf02814 401 }
1066a534
SP
402 }
403
404 for (; i > 0; i--)
405 {
bdf02814
SP
406 zk += X[i];
407 if (zk >= RADIX)
1066a534 408 {
bdf02814 409 Z[k--] = zk - RADIX;
5739f705 410 zk = 1;
1066a534
SP
411 }
412 else
c5d5d574 413 {
bdf02814 414 Z[k--] = zk;
5739f705 415 zk = 0;
bdf02814 416 }
1066a534
SP
417 }
418
5739f705 419 if (zk == 0)
1066a534 420 {
e69804d1 421 for (i = 1; i <= p2; i++)
1066a534
SP
422 Z[i] = Z[i + 1];
423 }
424 else
bdf02814
SP
425 {
426 Z[1] = zk;
5739f705 427 EZ += 1;
bdf02814 428 }
e4d82761
UD
429}
430
950c99ca
SP
431/* Subtract the magnitudes of *X and *Y assuming that abs (*x) > abs (*y) > 0.
432 The sign of the difference *Z is not changed. X and Y may overlap but not X
433 and Z or Y and Z. One guard digit is used. The error is less than one
434 ULP. */
31d3cc00
UD
435static void
436SECTION
1066a534
SP
437sub_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p)
438{
e69804d1
SP
439 long i, j, k;
440 long p2 = p;
6f2e90e7 441 mantissa_t zk;
e4d82761
UD
442
443 EZ = EX;
e69804d1
SP
444 i = p2;
445 j = p2 + EY - EX;
446 k = p2;
e4d82761 447
bdf02814
SP
448 /* Y is too small compared to X, copy X over to the result. */
449 if (__glibc_unlikely (j < 1))
1066a534 450 {
bdf02814
SP
451 __cpy (x, z, p);
452 return;
1066a534 453 }
bdf02814
SP
454
455 /* The relevant least significant digit in Y is non-zero, so we factor it in
456 to enhance accuracy. */
5739f705 457 if (j < p2 && Y[j + 1] > 0)
1066a534 458 {
bdf02814 459 Z[k + 1] = RADIX - Y[j + 1];
5739f705 460 zk = -1;
1066a534 461 }
bdf02814 462 else
5739f705 463 zk = Z[k + 1] = 0;
1066a534 464
bdf02814 465 /* Subtract and borrow. */
1066a534
SP
466 for (; j > 0; i--, j--)
467 {
bdf02814 468 zk += (X[i] - Y[j]);
5739f705 469 if (zk < 0)
1066a534 470 {
bdf02814 471 Z[k--] = zk + RADIX;
5739f705 472 zk = -1;
1066a534
SP
473 }
474 else
c5d5d574 475 {
bdf02814 476 Z[k--] = zk;
5739f705 477 zk = 0;
bdf02814 478 }
1066a534
SP
479 }
480
bdf02814 481 /* We're done with digits from Y, so it's just digits in X. */
1066a534
SP
482 for (; i > 0; i--)
483 {
bdf02814 484 zk += X[i];
5739f705 485 if (zk < 0)
1066a534 486 {
bdf02814 487 Z[k--] = zk + RADIX;
5739f705 488 zk = -1;
1066a534
SP
489 }
490 else
c5d5d574 491 {
bdf02814 492 Z[k--] = zk;
5739f705 493 zk = 0;
bdf02814 494 }
1066a534
SP
495 }
496
bdf02814 497 /* Normalize. */
c5d5d574
OB
498 for (i = 1; Z[i] == 0; i++)
499 ;
e4d82761 500 EZ = EZ - i + 1;
c5d5d574 501 for (k = 1; i <= p2 + 1; )
e4d82761 502 Z[k++] = Z[i++];
c5d5d574 503 for (; k <= p2; )
5739f705 504 Z[k++] = 0;
e4d82761
UD
505}
506
950c99ca
SP
507/* Add *X and *Y and store the result in *Z. X and Y may overlap, but not X
508 and Z or Y and Z. One guard digit is used. The error is less than one
509 ULP. */
31d3cc00
UD
510void
511SECTION
1066a534
SP
512__add (const mp_no *x, const mp_no *y, mp_no *z, int p)
513{
e4d82761
UD
514 int n;
515
5739f705 516 if (X[0] == 0)
1066a534
SP
517 {
518 __cpy (y, z, p);
519 return;
520 }
5739f705 521 else if (Y[0] == 0)
1066a534
SP
522 {
523 __cpy (x, z, p);
524 return;
525 }
526
527 if (X[0] == Y[0])
528 {
529 if (__acr (x, y, p) > 0)
530 {
531 add_magnitudes (x, y, z, p);
532 Z[0] = X[0];
533 }
534 else
535 {
536 add_magnitudes (y, x, z, p);
537 Z[0] = Y[0];
538 }
539 }
540 else
541 {
542 if ((n = __acr (x, y, p)) == 1)
543 {
544 sub_magnitudes (x, y, z, p);
545 Z[0] = X[0];
546 }
547 else if (n == -1)
548 {
549 sub_magnitudes (y, x, z, p);
550 Z[0] = Y[0];
551 }
552 else
5739f705 553 Z[0] = 0;
1066a534 554 }
e4d82761
UD
555}
556
950c99ca
SP
557/* Subtract *Y from *X and return the result in *Z. X and Y may overlap but
558 not X and Z or Y and Z. One guard digit is used. The error is less than
559 one ULP. */
31d3cc00
UD
560void
561SECTION
1066a534
SP
562__sub (const mp_no *x, const mp_no *y, mp_no *z, int p)
563{
e4d82761
UD
564 int n;
565
5739f705 566 if (X[0] == 0)
1066a534
SP
567 {
568 __cpy (y, z, p);
569 Z[0] = -Z[0];
570 return;
571 }
5739f705 572 else if (Y[0] == 0)
1066a534
SP
573 {
574 __cpy (x, z, p);
575 return;
576 }
577
578 if (X[0] != Y[0])
579 {
580 if (__acr (x, y, p) > 0)
581 {
582 add_magnitudes (x, y, z, p);
583 Z[0] = X[0];
584 }
585 else
586 {
587 add_magnitudes (y, x, z, p);
588 Z[0] = -Y[0];
589 }
590 }
591 else
592 {
593 if ((n = __acr (x, y, p)) == 1)
594 {
595 sub_magnitudes (x, y, z, p);
596 Z[0] = X[0];
597 }
598 else if (n == -1)
599 {
600 sub_magnitudes (y, x, z, p);
601 Z[0] = -Y[0];
602 }
603 else
5739f705 604 Z[0] = 0;
1066a534 605 }
e4d82761
UD
606}
607
82a9811d 608#ifndef NO__MUL
950c99ca
SP
609/* Multiply *X and *Y and store result in *Z. X and Y may overlap but not X
610 and Z or Y and Z. For P in [1, 2, 3], the exact result is truncated to P
611 digits. In case P > 3 the error is bounded by 1.001 ULP. */
31d3cc00
UD
612void
613SECTION
1066a534
SP
614__mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
615{
e69804d1
SP
616 long i, j, k, ip, ip2;
617 long p2 = p;
6f2e90e7 618 mantissa_store_t zk;
2d0e0f29 619 const mp_no *a;
6f2e90e7 620 mantissa_store_t *diag;
e4d82761 621
44e0d4c2 622 /* Is z=0? */
5739f705 623 if (__glibc_unlikely (X[0] * Y[0] == 0))
44e0d4c2 624 {
5739f705 625 Z[0] = 0;
44e0d4c2
SP
626 return;
627 }
e4d82761 628
909279a5
SP
629 /* We need not iterate through all X's and Y's since it's pointless to
630 multiply zeroes. Here, both are zero... */
e69804d1 631 for (ip2 = p2; ip2 > 0; ip2--)
5739f705 632 if (X[ip2] != 0 || Y[ip2] != 0)
909279a5 633 break;
44e0d4c2 634
5739f705 635 a = X[ip2] != 0 ? y : x;
2d0e0f29 636
909279a5
SP
637 /* ... and here, at least one of them is still zero. */
638 for (ip = ip2; ip > 0; ip--)
5739f705 639 if (a->d[ip] != 0)
909279a5
SP
640 break;
641
642 /* The product looks like this for p = 3 (as an example):
643
644
645 a1 a2 a3
646 x b1 b2 b3
647 -----------------------------
648 a1*b3 a2*b3 a3*b3
649 a1*b2 a2*b2 a3*b2
650 a1*b1 a2*b1 a3*b1
651
652 So our K needs to ideally be P*2, but we're limiting ourselves to P + 3
653 for P >= 3. We compute the above digits in two parts; the last P-1
654 digits and then the first P digits. The last P-1 digits are a sum of
655 products of the input digits from P to P-k where K is 0 for the least
656 significant digit and increases as we go towards the left. The product
657 term is of the form X[k]*X[P-k] as can be seen in the above example.
658
659 The first P digits are also a sum of products with the same product term,
660 except that the sum is from 1 to k. This is also evident from the above
661 example.
662
663 Another thing that becomes evident is that only the most significant
664 ip+ip2 digits of the result are non-zero, where ip and ip2 are the
665 'internal precision' of the input numbers, i.e. digits after ip and ip2
5739f705 666 are all 0. */
909279a5 667
e69804d1 668 k = (__glibc_unlikely (p2 < 3)) ? p2 + p2 : p2 + 3;
909279a5
SP
669
670 while (k > ip + ip2 + 1)
5739f705 671 Z[k--] = 0;
909279a5 672
5739f705 673 zk = 0;
45f05884
SP
674
675 /* Precompute sums of diagonal elements so that we can directly use them
676 later. See the next comment to know we why need them. */
6f2e90e7 677 diag = alloca (k * sizeof (mantissa_store_t));
5739f705 678 mantissa_store_t d = 0;
45f05884
SP
679 for (i = 1; i <= ip; i++)
680 {
6f2e90e7 681 d += X[i] * (mantissa_store_t) Y[i];
45f05884
SP
682 diag[i] = d;
683 }
684 while (i < k)
685 diag[i++] = d;
909279a5 686
e69804d1 687 while (k > p2)
44e0d4c2 688 {
45f05884
SP
689 long lim = k / 2;
690
691 if (k % 2 == 0)
692 /* We want to add this only once, but since we subtract it in the sum
693 of products above, we add twice. */
6f2e90e7 694 zk += 2 * X[lim] * (mantissa_store_t) Y[lim];
45f05884
SP
695
696 for (i = k - p2, j = p2; i < j; i++, j--)
6f2e90e7 697 zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]);
45f05884
SP
698
699 zk -= diag[k - 1];
44e0d4c2 700
6f2e90e7
SP
701 DIV_RADIX (zk, Z[k]);
702 k--;
44e0d4c2
SP
703 }
704
45f05884
SP
705 /* The real deal. Mantissa digit Z[k] is the sum of all X[i] * Y[j] where i
706 goes from 1 -> k - 1 and j goes the same range in reverse. To reduce the
707 number of multiplications, we halve the range and if k is an even number,
708 add the diagonal element X[k/2]Y[k/2]. Through the half range, we compute
709 X[i] * Y[j] as (X[i] + X[j]) * (Y[i] + Y[j]) - X[i] * Y[i] - X[j] * Y[j].
710
711 This reduction tells us that we're summing two things, the first term
712 through the half range and the negative of the sum of the product of all
713 terms of X and Y in the full range. i.e.
714
715 SUM(X[i] * Y[i]) for k terms. This is precalculated above for each k in
716 a single loop so that it completes in O(n) time and can hence be directly
717 used in the loop below. */
44e0d4c2
SP
718 while (k > 1)
719 {
45f05884
SP
720 long lim = k / 2;
721
722 if (k % 2 == 0)
723 /* We want to add this only once, but since we subtract it in the sum
724 of products above, we add twice. */
6f2e90e7 725 zk += 2 * X[lim] * (mantissa_store_t) Y[lim];
45f05884
SP
726
727 for (i = 1, j = k - 1; i < j; i++, j--)
6f2e90e7 728 zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]);
45f05884
SP
729
730 zk -= diag[k - 1];
44e0d4c2 731
6f2e90e7
SP
732 DIV_RADIX (zk, Z[k]);
733 k--;
44e0d4c2 734 }
2a91b573 735 Z[k] = zk;
44e0d4c2 736
4709fe76
SP
737 /* Get the exponent sum into an intermediate variable. This is a subtle
738 optimization, where given enough registers, all operations on the exponent
739 happen in registers and the result is written out only once into EZ. */
740 int e = EX + EY;
741
950c99ca 742 /* Is there a carry beyond the most significant digit? */
5739f705 743 if (__glibc_unlikely (Z[1] == 0))
44e0d4c2 744 {
e69804d1 745 for (i = 1; i <= p2; i++)
1066a534 746 Z[i] = Z[i + 1];
4709fe76 747 e--;
44e0d4c2 748 }
e4d82761 749
4709fe76 750 EZ = e;
e4d82761 751 Z[0] = X[0] * Y[0];
e4d82761 752}
82a9811d 753#endif
e4d82761 754
82a9811d 755#ifndef NO__SQR
d6752ccd
SP
756/* Square *X and store result in *Y. X and Y may not overlap. For P in
757 [1, 2, 3], the exact result is truncated to P digits. In case P > 3 the
758 error is bounded by 1.001 ULP. This is a faster special case of
759 multiplication. */
760void
761SECTION
762__sqr (const mp_no *x, mp_no *y, int p)
763{
764 long i, j, k, ip;
6f2e90e7 765 mantissa_store_t yk;
d6752ccd
SP
766
767 /* Is z=0? */
5739f705 768 if (__glibc_unlikely (X[0] == 0))
d6752ccd 769 {
5739f705 770 Y[0] = 0;
d6752ccd
SP
771 return;
772 }
773
774 /* We need not iterate through all X's since it's pointless to
775 multiply zeroes. */
776 for (ip = p; ip > 0; ip--)
5739f705 777 if (X[ip] != 0)
d6752ccd
SP
778 break;
779
780 k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
781
782 while (k > 2 * ip + 1)
5739f705 783 Y[k--] = 0;
d6752ccd 784
5739f705 785 yk = 0;
d6752ccd
SP
786
787 while (k > p)
788 {
6f2e90e7 789 mantissa_store_t yk2 = 0;
d6752ccd
SP
790 long lim = k / 2;
791
792 if (k % 2 == 0)
6f2e90e7 793 yk += X[lim] * (mantissa_store_t) X[lim];
d6752ccd 794
20cd7fb3
SP
795 /* In __mul, this loop (and the one within the next while loop) run
796 between a range to calculate the mantissa as follows:
797
798 Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1]
799 + X[n] * Y[k]
800
801 For X == Y, we can get away with summing halfway and doubling the
802 result. For cases where the range size is even, the mid-point needs
803 to be added separately (above). */
22af19f9 804 for (i = k - p, j = p; i < j; i++, j--)
6f2e90e7 805 yk2 += X[i] * (mantissa_store_t) X[j];
d6752ccd 806
5739f705 807 yk += 2 * yk2;
d6752ccd 808
6f2e90e7
SP
809 DIV_RADIX (yk, Y[k]);
810 k--;
d6752ccd
SP
811 }
812
813 while (k > 1)
814 {
6f2e90e7 815 mantissa_store_t yk2 = 0;
d6752ccd
SP
816 long lim = k / 2;
817
818 if (k % 2 == 0)
6f2e90e7 819 yk += X[lim] * (mantissa_store_t) X[lim];
d6752ccd 820
20cd7fb3 821 /* Likewise for this loop. */
22af19f9 822 for (i = 1, j = k - 1; i < j; i++, j--)
6f2e90e7 823 yk2 += X[i] * (mantissa_store_t) X[j];
d6752ccd 824
5739f705 825 yk += 2 * yk2;
d6752ccd 826
6f2e90e7
SP
827 DIV_RADIX (yk, Y[k]);
828 k--;
d6752ccd
SP
829 }
830 Y[k] = yk;
831
832 /* Squares are always positive. */
5739f705 833 Y[0] = 1;
d6752ccd 834
4709fe76
SP
835 /* Get the exponent sum into an intermediate variable. This is a subtle
836 optimization, where given enough registers, all operations on the exponent
837 happen in registers and the result is written out only once into EZ. */
838 int e = EX * 2;
839
d6752ccd 840 /* Is there a carry beyond the most significant digit? */
5739f705 841 if (__glibc_unlikely (Y[1] == 0))
d6752ccd
SP
842 {
843 for (i = 1; i <= p; i++)
844 Y[i] = Y[i + 1];
4709fe76 845 e--;
d6752ccd 846 }
4709fe76
SP
847
848 EY = e;
d6752ccd 849}
82a9811d 850#endif
d6752ccd 851
950c99ca
SP
852/* Invert *X and store in *Y. Relative error bound:
853 - For P = 2: 1.001 * R ^ (1 - P)
854 - For P = 3: 1.063 * R ^ (1 - P)
855 - For P > 3: 2.001 * R ^ (1 - P)
e4d82761 856
950c99ca 857 *X = 0 is not permissible. */
1066a534 858static void
31d3cc00 859SECTION
1066a534
SP
860__inv (const mp_no *x, mp_no *y, int p)
861{
e69804d1 862 long i;
e4d82761 863 double t;
1066a534
SP
864 mp_no z, w;
865 static const int np1[] =
866 { 0, 0, 0, 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
867 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
868 };
869
870 __cpy (x, &z, p);
871 z.e = 0;
872 __mp_dbl (&z, &t, p);
5739f705 873 t = 1 / t;
1066a534
SP
874 __dbl_mp (t, y, p);
875 EY -= EX;
876
877 for (i = 0; i < np1[p]; i++)
878 {
879 __cpy (y, &w, p);
880 __mul (x, &w, y, p);
107a5bf0 881 __sub (&__mptwo, y, &z, p);
1066a534
SP
882 __mul (&w, &z, y, p);
883 }
e4d82761
UD
884}
885
950c99ca
SP
886/* Divide *X by *Y and store result in *Z. X and Y may overlap but not X and Z
887 or Y and Z. Relative error bound:
888 - For P = 2: 2.001 * R ^ (1 - P)
889 - For P = 3: 2.063 * R ^ (1 - P)
890 - For P > 3: 3.001 * R ^ (1 - P)
e4d82761 891
950c99ca 892 *X = 0 is not permissible. */
31d3cc00
UD
893void
894SECTION
1066a534
SP
895__dvd (const mp_no *x, const mp_no *y, mp_no *z, int p)
896{
e4d82761
UD
897 mp_no w;
898
5739f705
SP
899 if (X[0] == 0)
900 Z[0] = 0;
1066a534
SP
901 else
902 {
903 __inv (y, &w, p);
904 __mul (x, &w, z, p);
905 }
e4d82761 906}