]>
Commit | Line | Data |
---|---|---|
04067002 UD |
1 | |
2 | /* | |
3 | * IBM Accurate Mathematical Library | |
4 | * written by International Business Machines Corp. | |
b168057a | 5 | * Copyright (C) 2001-2015 Free Software Foundation, Inc. |
04067002 UD |
6 | * |
7 | * This program is free software; you can redistribute it and/or modify | |
8 | * it under the terms of the GNU Lesser General Public License as published by | |
9 | * the Free Software Foundation; either version 2.1 of the License, or | |
10 | * (at your option) any later version. | |
11 | * | |
12 | * This program is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | * GNU Lesser General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Lesser General Public License | |
59ba27a6 | 18 | * along with this program; if not, see <http://www.gnu.org/licenses/>. |
04067002 | 19 | */ |
04067002 | 20 | |
82a9811d SP |
21 | /* Define __mul and __sqr and use the rest from generic code. */ |
22 | #define NO__MUL | |
23 | #define NO__SQR | |
04067002 | 24 | |
82a9811d | 25 | #include <sysdeps/ieee754/dbl-64/mpa.c> |
04067002 | 26 | |
950c99ca SP |
27 | /* Multiply *X and *Y and store result in *Z. X and Y may overlap but not X |
28 | and Z or Y and Z. For P in [1, 2, 3], the exact result is truncated to P | |
29 | digits. In case P > 3 the error is bounded by 1.001 ULP. */ | |
1066a534 SP |
30 | void |
31 | __mul (const mp_no *x, const mp_no *y, mp_no *z, int p) | |
32 | { | |
04067002 UD |
33 | long i, i1, i2, j, k, k2; |
34 | long p2 = p; | |
35 | double u, zk, zk2; | |
36 | ||
20cd7fb3 | 37 | /* Is z=0? */ |
a64d7e0e | 38 | if (__glibc_unlikely (X[0] * Y[0] == 0)) |
1066a534 | 39 | { |
a64d7e0e | 40 | Z[0] = 0; |
1066a534 SP |
41 | return; |
42 | } | |
04067002 | 43 | |
1066a534 SP |
44 | /* Multiply, add and carry */ |
45 | k2 = (p2 < 3) ? p2 + p2 : p2 + 3; | |
a64d7e0e | 46 | zk = Z[k2] = 0; |
1066a534 | 47 | for (k = k2; k > 1;) |
04067002 | 48 | { |
1066a534 | 49 | if (k > p2) |
04067002 | 50 | { |
1066a534 SP |
51 | i1 = k - p2; |
52 | i2 = p2 + 1; | |
04067002 | 53 | } |
1066a534 | 54 | else |
04067002 | 55 | { |
1066a534 SP |
56 | i1 = 1; |
57 | i2 = k; | |
58 | } | |
59 | #if 1 | |
60 | /* Rearrange this inner loop to allow the fmadd instructions to be | |
61 | independent and execute in parallel on processors that have | |
62 | dual symmetrical FP pipelines. */ | |
63 | if (i1 < (i2 - 1)) | |
64 | { | |
65 | /* Make sure we have at least 2 iterations. */ | |
66 | if (((i2 - i1) & 1L) == 1L) | |
67 | { | |
68 | /* Handle the odd iterations case. */ | |
69 | zk2 = x->d[i2 - 1] * y->d[i1]; | |
70 | } | |
71 | else | |
72 | zk2 = 0.0; | |
73 | /* Do two multiply/adds per loop iteration, using independent | |
74 | accumulators; zk and zk2. */ | |
75 | for (i = i1, j = i2 - 1; i < i2 - 1; i += 2, j -= 2) | |
76 | { | |
77 | zk += x->d[i] * y->d[j]; | |
78 | zk2 += x->d[i + 1] * y->d[j - 1]; | |
79 | } | |
80 | zk += zk2; /* Final sum. */ | |
81 | } | |
82 | else | |
83 | { | |
84 | /* Special case when iterations is 1. */ | |
85 | zk += x->d[i1] * y->d[i1]; | |
04067002 | 86 | } |
04067002 | 87 | #else |
1066a534 SP |
88 | /* The original code. */ |
89 | for (i = i1, j = i2 - 1; i < i2; i++, j--) | |
90 | zk += X[i] * Y[j]; | |
04067002 UD |
91 | #endif |
92 | ||
1066a534 SP |
93 | u = (zk + CUTTER) - CUTTER; |
94 | if (u > zk) | |
95 | u -= RADIX; | |
96 | Z[k] = zk - u; | |
97 | zk = u * RADIXI; | |
98 | --k; | |
99 | } | |
04067002 UD |
100 | Z[k] = zk; |
101 | ||
e6ebd4a7 | 102 | int e = EX + EY; |
950c99ca | 103 | /* Is there a carry beyond the most significant digit? */ |
a64d7e0e | 104 | if (Z[1] == 0) |
1066a534 SP |
105 | { |
106 | for (i = 1; i <= p2; i++) | |
107 | Z[i] = Z[i + 1]; | |
e6ebd4a7 | 108 | e--; |
1066a534 | 109 | } |
04067002 | 110 | |
e6ebd4a7 | 111 | EZ = e; |
04067002 | 112 | Z[0] = X[0] * Y[0]; |
04067002 UD |
113 | } |
114 | ||
d6752ccd SP |
115 | /* Square *X and store result in *Y. X and Y may not overlap. For P in |
116 | [1, 2, 3], the exact result is truncated to P digits. In case P > 3 the | |
117 | error is bounded by 1.001 ULP. This is a faster special case of | |
118 | multiplication. */ | |
119 | void | |
120 | __sqr (const mp_no *x, mp_no *y, int p) | |
121 | { | |
122 | long i, j, k, ip; | |
123 | double u, yk; | |
124 | ||
125 | /* Is z=0? */ | |
a64d7e0e | 126 | if (__glibc_unlikely (X[0] == 0)) |
d6752ccd | 127 | { |
a64d7e0e | 128 | Y[0] = 0; |
d6752ccd SP |
129 | return; |
130 | } | |
131 | ||
132 | /* We need not iterate through all X's since it's pointless to | |
133 | multiply zeroes. */ | |
134 | for (ip = p; ip > 0; ip--) | |
a64d7e0e | 135 | if (X[ip] != 0) |
d6752ccd SP |
136 | break; |
137 | ||
138 | k = (__glibc_unlikely (p < 3)) ? p + p : p + 3; | |
139 | ||
140 | while (k > 2 * ip + 1) | |
a64d7e0e | 141 | Y[k--] = 0; |
d6752ccd | 142 | |
a64d7e0e | 143 | yk = 0; |
d6752ccd SP |
144 | |
145 | while (k > p) | |
146 | { | |
147 | double yk2 = 0.0; | |
148 | long lim = k / 2; | |
149 | ||
150 | if (k % 2 == 0) | |
151 | { | |
152 | yk += X[lim] * X[lim]; | |
153 | lim--; | |
154 | } | |
155 | ||
156 | /* In __mul, this loop (and the one within the next while loop) run | |
157 | between a range to calculate the mantissa as follows: | |
158 | ||
159 | Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1] | |
160 | + X[n] * Y[k] | |
161 | ||
162 | For X == Y, we can get away with summing halfway and doubling the | |
163 | result. For cases where the range size is even, the mid-point needs | |
164 | to be added separately (above). */ | |
165 | for (i = k - p, j = p; i <= lim; i++, j--) | |
166 | yk2 += X[i] * X[j]; | |
167 | ||
168 | yk += 2.0 * yk2; | |
169 | ||
170 | u = (yk + CUTTER) - CUTTER; | |
171 | if (u > yk) | |
172 | u -= RADIX; | |
173 | Y[k--] = yk - u; | |
174 | yk = u * RADIXI; | |
175 | } | |
176 | ||
177 | while (k > 1) | |
178 | { | |
179 | double yk2 = 0.0; | |
180 | long lim = k / 2; | |
181 | ||
182 | if (k % 2 == 0) | |
183 | { | |
184 | yk += X[lim] * X[lim]; | |
185 | lim--; | |
186 | } | |
187 | ||
188 | /* Likewise for this loop. */ | |
189 | for (i = 1, j = k - 1; i <= lim; i++, j--) | |
190 | yk2 += X[i] * X[j]; | |
191 | ||
192 | yk += 2.0 * yk2; | |
193 | ||
194 | u = (yk + CUTTER) - CUTTER; | |
195 | if (u > yk) | |
196 | u -= RADIX; | |
197 | Y[k--] = yk - u; | |
198 | yk = u * RADIXI; | |
199 | } | |
200 | Y[k] = yk; | |
201 | ||
202 | /* Squares are always positive. */ | |
203 | Y[0] = 1.0; | |
204 | ||
e6ebd4a7 | 205 | int e = EX * 2; |
d6752ccd | 206 | /* Is there a carry beyond the most significant digit? */ |
a64d7e0e | 207 | if (__glibc_unlikely (Y[1] == 0)) |
d6752ccd SP |
208 | { |
209 | for (i = 1; i <= p; i++) | |
210 | Y[i] = Y[i + 1]; | |
e6ebd4a7 | 211 | e--; |
d6752ccd | 212 | } |
e6ebd4a7 | 213 | EY = e; |
d6752ccd | 214 | } |