]>
Commit | Line | Data |
---|---|---|
dfeab068 RE |
1 | /* crypto/bn/bn_asm.c */ |
2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | |
3 | * All rights reserved. | |
4 | * | |
5 | * This package is an SSL implementation written | |
6 | * by Eric Young (eay@cryptsoft.com). | |
7 | * The implementation was written so as to conform with Netscapes SSL. | |
0f113f3e | 8 | * |
dfeab068 RE |
9 | * This library is free for commercial and non-commercial use as long as |
10 | * the following conditions are aheared to. The following conditions | |
11 | * apply to all code found in this distribution, be it the RC4, RSA, | |
12 | * lhash, DES, etc., code; not just the SSL code. The SSL documentation | |
13 | * included with this distribution is covered by the same copyright terms | |
14 | * except that the holder is Tim Hudson (tjh@cryptsoft.com). | |
0f113f3e | 15 | * |
dfeab068 RE |
16 | * Copyright remains Eric Young's, and as such any Copyright notices in |
17 | * the code are not to be removed. | |
18 | * If this package is used in a product, Eric Young should be given attribution | |
19 | * as the author of the parts of the library used. | |
20 | * This can be in the form of a textual message at program startup or | |
21 | * in documentation (online or textual) provided with the package. | |
0f113f3e | 22 | * |
dfeab068 RE |
23 | * Redistribution and use in source and binary forms, with or without |
24 | * modification, are permitted provided that the following conditions | |
25 | * are met: | |
26 | * 1. Redistributions of source code must retain the copyright | |
27 | * notice, this list of conditions and the following disclaimer. | |
28 | * 2. Redistributions in binary form must reproduce the above copyright | |
29 | * notice, this list of conditions and the following disclaimer in the | |
30 | * documentation and/or other materials provided with the distribution. | |
31 | * 3. All advertising materials mentioning features or use of this software | |
32 | * must display the following acknowledgement: | |
33 | * "This product includes cryptographic software written by | |
34 | * Eric Young (eay@cryptsoft.com)" | |
35 | * The word 'cryptographic' can be left out if the rouines from the library | |
36 | * being used are not cryptographic related :-). | |
0f113f3e | 37 | * 4. If you include any Windows specific code (or a derivative thereof) from |
dfeab068 RE |
38 | * the apps directory (application code) you must include an acknowledgement: |
39 | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" | |
0f113f3e | 40 | * |
dfeab068 RE |
41 | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND |
42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
51 | * SUCH DAMAGE. | |
0f113f3e | 52 | * |
dfeab068 RE |
53 | * The licence and distribution terms for any publically available version or |
54 | * derivative of this code cannot be changed. i.e. this code cannot simply be | |
55 | * copied and put under another distribution licence | |
56 | * [including the GNU Public Licence.] | |
57 | */ | |
58 | ||
775c63fc | 59 | #ifndef BN_DEBUG |
0f113f3e | 60 | # undef NDEBUG /* avoid conflicting definitions */ |
775c63fc UM |
61 | # define NDEBUG |
62 | #endif | |
63 | ||
775c63fc | 64 | #include <assert.h> |
d749e108 | 65 | #include <openssl/crypto.h> |
b39fc560 | 66 | #include "internal/cryptlib.h" |
dfeab068 RE |
67 | #include "bn_lcl.h" |
68 | ||
fb81ac5e | 69 | #if defined(BN_LLONG) || defined(BN_UMULT_HIGH) |
dfeab068 | 70 | |
0f113f3e MC |
71 | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, |
72 | BN_ULONG w) | |
73 | { | |
74 | BN_ULONG c1 = 0; | |
75 | ||
76 | assert(num >= 0); | |
77 | if (num <= 0) | |
78 | return (c1); | |
79 | ||
80 | # ifndef OPENSSL_SMALL_FOOTPRINT | |
81 | while (num & ~3) { | |
82 | mul_add(rp[0], ap[0], w, c1); | |
83 | mul_add(rp[1], ap[1], w, c1); | |
84 | mul_add(rp[2], ap[2], w, c1); | |
85 | mul_add(rp[3], ap[3], w, c1); | |
86 | ap += 4; | |
87 | rp += 4; | |
88 | num -= 4; | |
89 | } | |
90 | # endif | |
91 | while (num) { | |
92 | mul_add(rp[0], ap[0], w, c1); | |
93 | ap++; | |
94 | rp++; | |
95 | num--; | |
96 | } | |
97 | ||
98 | return (c1); | |
99 | } | |
dfeab068 | 100 | |
2fbc8a2a | 101 | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
0f113f3e MC |
102 | { |
103 | BN_ULONG c1 = 0; | |
104 | ||
105 | assert(num >= 0); | |
106 | if (num <= 0) | |
107 | return (c1); | |
108 | ||
109 | # ifndef OPENSSL_SMALL_FOOTPRINT | |
110 | while (num & ~3) { | |
111 | mul(rp[0], ap[0], w, c1); | |
112 | mul(rp[1], ap[1], w, c1); | |
113 | mul(rp[2], ap[2], w, c1); | |
114 | mul(rp[3], ap[3], w, c1); | |
115 | ap += 4; | |
116 | rp += 4; | |
117 | num -= 4; | |
118 | } | |
119 | # endif | |
120 | while (num) { | |
121 | mul(rp[0], ap[0], w, c1); | |
122 | ap++; | |
123 | rp++; | |
124 | num--; | |
125 | } | |
126 | return (c1); | |
127 | } | |
dfeab068 | 128 | |
2fbc8a2a | 129 | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) |
0f113f3e MC |
130 | { |
131 | assert(n >= 0); | |
132 | if (n <= 0) | |
133 | return; | |
134 | ||
135 | # ifndef OPENSSL_SMALL_FOOTPRINT | |
136 | while (n & ~3) { | |
137 | sqr(r[0], r[1], a[0]); | |
138 | sqr(r[2], r[3], a[1]); | |
139 | sqr(r[4], r[5], a[2]); | |
140 | sqr(r[6], r[7], a[3]); | |
141 | a += 4; | |
142 | r += 8; | |
143 | n -= 4; | |
144 | } | |
145 | # endif | |
146 | while (n) { | |
147 | sqr(r[0], r[1], a[0]); | |
148 | a++; | |
149 | r += 2; | |
150 | n--; | |
151 | } | |
152 | } | |
153 | ||
154 | #else /* !(defined(BN_LLONG) || | |
155 | * defined(BN_UMULT_HIGH)) */ | |
156 | ||
157 | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, | |
158 | BN_ULONG w) | |
159 | { | |
160 | BN_ULONG c = 0; | |
161 | BN_ULONG bl, bh; | |
162 | ||
163 | assert(num >= 0); | |
164 | if (num <= 0) | |
165 | return ((BN_ULONG)0); | |
166 | ||
167 | bl = LBITS(w); | |
168 | bh = HBITS(w); | |
169 | ||
170 | # ifndef OPENSSL_SMALL_FOOTPRINT | |
171 | while (num & ~3) { | |
172 | mul_add(rp[0], ap[0], bl, bh, c); | |
173 | mul_add(rp[1], ap[1], bl, bh, c); | |
174 | mul_add(rp[2], ap[2], bl, bh, c); | |
175 | mul_add(rp[3], ap[3], bl, bh, c); | |
176 | ap += 4; | |
177 | rp += 4; | |
178 | num -= 4; | |
179 | } | |
180 | # endif | |
181 | while (num) { | |
182 | mul_add(rp[0], ap[0], bl, bh, c); | |
183 | ap++; | |
184 | rp++; | |
185 | num--; | |
186 | } | |
187 | return (c); | |
188 | } | |
dfeab068 | 189 | |
6343829a | 190 | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
0f113f3e MC |
191 | { |
192 | BN_ULONG carry = 0; | |
193 | BN_ULONG bl, bh; | |
194 | ||
195 | assert(num >= 0); | |
196 | if (num <= 0) | |
197 | return ((BN_ULONG)0); | |
198 | ||
199 | bl = LBITS(w); | |
200 | bh = HBITS(w); | |
201 | ||
202 | # ifndef OPENSSL_SMALL_FOOTPRINT | |
203 | while (num & ~3) { | |
204 | mul(rp[0], ap[0], bl, bh, carry); | |
205 | mul(rp[1], ap[1], bl, bh, carry); | |
206 | mul(rp[2], ap[2], bl, bh, carry); | |
207 | mul(rp[3], ap[3], bl, bh, carry); | |
208 | ap += 4; | |
209 | rp += 4; | |
210 | num -= 4; | |
211 | } | |
212 | # endif | |
213 | while (num) { | |
214 | mul(rp[0], ap[0], bl, bh, carry); | |
215 | ap++; | |
216 | rp++; | |
217 | num--; | |
218 | } | |
219 | return (carry); | |
220 | } | |
dfeab068 | 221 | |
6343829a | 222 | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) |
0f113f3e MC |
223 | { |
224 | assert(n >= 0); | |
225 | if (n <= 0) | |
226 | return; | |
227 | ||
228 | # ifndef OPENSSL_SMALL_FOOTPRINT | |
229 | while (n & ~3) { | |
230 | sqr64(r[0], r[1], a[0]); | |
231 | sqr64(r[2], r[3], a[1]); | |
232 | sqr64(r[4], r[5], a[2]); | |
233 | sqr64(r[6], r[7], a[3]); | |
234 | a += 4; | |
235 | r += 8; | |
236 | n -= 4; | |
237 | } | |
238 | # endif | |
239 | while (n) { | |
240 | sqr64(r[0], r[1], a[0]); | |
241 | a++; | |
242 | r += 2; | |
243 | n--; | |
244 | } | |
245 | } | |
246 | ||
247 | #endif /* !(defined(BN_LLONG) || | |
248 | * defined(BN_UMULT_HIGH)) */ | |
dfeab068 RE |
249 | |
250 | #if defined(BN_LLONG) && defined(BN_DIV2W) | |
251 | ||
6b691a5c | 252 | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) |
0f113f3e MC |
253 | { |
254 | return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d)); | |
255 | } | |
dfeab068 RE |
256 | |
257 | #else | |
258 | ||
bbb8de09 | 259 | /* Divide h,l by d and return the result. */ |
dfeab068 | 260 | /* I need to test this some more :-( */ |
6b691a5c | 261 | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) |
0f113f3e MC |
262 | { |
263 | BN_ULONG dh, dl, q, ret = 0, th, tl, t; | |
264 | int i, count = 2; | |
265 | ||
266 | if (d == 0) | |
267 | return (BN_MASK2); | |
268 | ||
269 | i = BN_num_bits_word(d); | |
270 | assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i)); | |
271 | ||
272 | i = BN_BITS2 - i; | |
273 | if (h >= d) | |
274 | h -= d; | |
275 | ||
276 | if (i) { | |
277 | d <<= i; | |
278 | h = (h << i) | (l >> (BN_BITS2 - i)); | |
279 | l <<= i; | |
280 | } | |
281 | dh = (d & BN_MASK2h) >> BN_BITS4; | |
282 | dl = (d & BN_MASK2l); | |
283 | for (;;) { | |
284 | if ((h >> BN_BITS4) == dh) | |
285 | q = BN_MASK2l; | |
286 | else | |
287 | q = h / dh; | |
288 | ||
289 | th = q * dh; | |
290 | tl = dl * q; | |
291 | for (;;) { | |
292 | t = h - th; | |
293 | if ((t & BN_MASK2h) || | |
294 | ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) | |
295 | break; | |
296 | q--; | |
297 | th -= dh; | |
298 | tl -= dl; | |
299 | } | |
300 | t = (tl >> BN_BITS4); | |
301 | tl = (tl << BN_BITS4) & BN_MASK2h; | |
302 | th += t; | |
303 | ||
304 | if (l < tl) | |
305 | th++; | |
306 | l -= tl; | |
307 | if (h < th) { | |
308 | h += d; | |
309 | q--; | |
310 | } | |
311 | h -= th; | |
312 | ||
313 | if (--count == 0) | |
314 | break; | |
315 | ||
316 | ret = q << BN_BITS4; | |
317 | h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2; | |
318 | l = (l & BN_MASK2l) << BN_BITS4; | |
319 | } | |
320 | ret |= q; | |
321 | return (ret); | |
322 | } | |
323 | #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */ | |
dfeab068 RE |
324 | |
325 | #ifdef BN_LLONG | |
0f113f3e MC |
326 | BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, |
327 | int n) | |
328 | { | |
329 | BN_ULLONG ll = 0; | |
330 | ||
331 | assert(n >= 0); | |
332 | if (n <= 0) | |
333 | return ((BN_ULONG)0); | |
334 | ||
335 | # ifndef OPENSSL_SMALL_FOOTPRINT | |
336 | while (n & ~3) { | |
337 | ll += (BN_ULLONG) a[0] + b[0]; | |
338 | r[0] = (BN_ULONG)ll & BN_MASK2; | |
339 | ll >>= BN_BITS2; | |
340 | ll += (BN_ULLONG) a[1] + b[1]; | |
341 | r[1] = (BN_ULONG)ll & BN_MASK2; | |
342 | ll >>= BN_BITS2; | |
343 | ll += (BN_ULLONG) a[2] + b[2]; | |
344 | r[2] = (BN_ULONG)ll & BN_MASK2; | |
345 | ll >>= BN_BITS2; | |
346 | ll += (BN_ULLONG) a[3] + b[3]; | |
347 | r[3] = (BN_ULONG)ll & BN_MASK2; | |
348 | ll >>= BN_BITS2; | |
349 | a += 4; | |
350 | b += 4; | |
351 | r += 4; | |
352 | n -= 4; | |
353 | } | |
354 | # endif | |
355 | while (n) { | |
356 | ll += (BN_ULLONG) a[0] + b[0]; | |
357 | r[0] = (BN_ULONG)ll & BN_MASK2; | |
358 | ll >>= BN_BITS2; | |
359 | a++; | |
360 | b++; | |
361 | r++; | |
362 | n--; | |
363 | } | |
364 | return ((BN_ULONG)ll); | |
365 | } | |
366 | #else /* !BN_LLONG */ | |
367 | BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, | |
368 | int n) | |
369 | { | |
370 | BN_ULONG c, l, t; | |
371 | ||
372 | assert(n >= 0); | |
373 | if (n <= 0) | |
374 | return ((BN_ULONG)0); | |
375 | ||
376 | c = 0; | |
377 | # ifndef OPENSSL_SMALL_FOOTPRINT | |
378 | while (n & ~3) { | |
379 | t = a[0]; | |
380 | t = (t + c) & BN_MASK2; | |
381 | c = (t < c); | |
382 | l = (t + b[0]) & BN_MASK2; | |
383 | c += (l < t); | |
384 | r[0] = l; | |
385 | t = a[1]; | |
386 | t = (t + c) & BN_MASK2; | |
387 | c = (t < c); | |
388 | l = (t + b[1]) & BN_MASK2; | |
389 | c += (l < t); | |
390 | r[1] = l; | |
391 | t = a[2]; | |
392 | t = (t + c) & BN_MASK2; | |
393 | c = (t < c); | |
394 | l = (t + b[2]) & BN_MASK2; | |
395 | c += (l < t); | |
396 | r[2] = l; | |
397 | t = a[3]; | |
398 | t = (t + c) & BN_MASK2; | |
399 | c = (t < c); | |
400 | l = (t + b[3]) & BN_MASK2; | |
401 | c += (l < t); | |
402 | r[3] = l; | |
403 | a += 4; | |
404 | b += 4; | |
405 | r += 4; | |
406 | n -= 4; | |
407 | } | |
408 | # endif | |
409 | while (n) { | |
410 | t = a[0]; | |
411 | t = (t + c) & BN_MASK2; | |
412 | c = (t < c); | |
413 | l = (t + b[0]) & BN_MASK2; | |
414 | c += (l < t); | |
415 | r[0] = l; | |
416 | a++; | |
417 | b++; | |
418 | r++; | |
419 | n--; | |
420 | } | |
421 | return ((BN_ULONG)c); | |
422 | } | |
423 | #endif /* !BN_LLONG */ | |
424 | ||
425 | BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, | |
426 | int n) | |
427 | { | |
428 | BN_ULONG t1, t2; | |
429 | int c = 0; | |
430 | ||
431 | assert(n >= 0); | |
432 | if (n <= 0) | |
433 | return ((BN_ULONG)0); | |
dfeab068 | 434 | |
682b112a | 435 | #ifndef OPENSSL_SMALL_FOOTPRINT |
0f113f3e MC |
436 | while (n & ~3) { |
437 | t1 = a[0]; | |
438 | t2 = b[0]; | |
439 | r[0] = (t1 - t2 - c) & BN_MASK2; | |
440 | if (t1 != t2) | |
441 | c = (t1 < t2); | |
442 | t1 = a[1]; | |
443 | t2 = b[1]; | |
444 | r[1] = (t1 - t2 - c) & BN_MASK2; | |
445 | if (t1 != t2) | |
446 | c = (t1 < t2); | |
447 | t1 = a[2]; | |
448 | t2 = b[2]; | |
449 | r[2] = (t1 - t2 - c) & BN_MASK2; | |
450 | if (t1 != t2) | |
451 | c = (t1 < t2); | |
452 | t1 = a[3]; | |
453 | t2 = b[3]; | |
454 | r[3] = (t1 - t2 - c) & BN_MASK2; | |
455 | if (t1 != t2) | |
456 | c = (t1 < t2); | |
457 | a += 4; | |
458 | b += 4; | |
459 | r += 4; | |
460 | n -= 4; | |
461 | } | |
682b112a | 462 | #endif |
0f113f3e MC |
463 | while (n) { |
464 | t1 = a[0]; | |
465 | t2 = b[0]; | |
466 | r[0] = (t1 - t2 - c) & BN_MASK2; | |
467 | if (t1 != t2) | |
468 | c = (t1 < t2); | |
469 | a++; | |
470 | b++; | |
471 | r++; | |
472 | n--; | |
473 | } | |
474 | return (c); | |
475 | } | |
dfeab068 | 476 | |
682b112a | 477 | #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) |
dfeab068 | 478 | |
0f113f3e MC |
479 | # undef bn_mul_comba8 |
480 | # undef bn_mul_comba4 | |
481 | # undef bn_sqr_comba8 | |
482 | # undef bn_sqr_comba4 | |
dfeab068 | 483 | |
775c63fc UM |
484 | /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ |
485 | /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ | |
486 | /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ | |
0f113f3e MC |
487 | /* |
488 | * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number | |
489 | * c=(c2,c1,c0) | |
490 | */ | |
775c63fc | 491 | |
0f113f3e | 492 | # ifdef BN_LLONG |
a7a44ba5 AP |
493 | /* |
494 | * Keep in mind that additions to multiplication result can not | |
495 | * overflow, because its high half cannot be all-ones. | |
496 | */ | |
0f113f3e MC |
497 | # define mul_add_c(a,b,c0,c1,c2) do { \ |
498 | BN_ULONG hi; \ | |
499 | BN_ULLONG t = (BN_ULLONG)(a)*(b); \ | |
500 | t += c0; /* no carry */ \ | |
501 | c0 = (BN_ULONG)Lw(t); \ | |
502 | hi = (BN_ULONG)Hw(t); \ | |
503 | c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ | |
504 | } while(0) | |
505 | ||
506 | # define mul_add_c2(a,b,c0,c1,c2) do { \ | |
507 | BN_ULONG hi; \ | |
508 | BN_ULLONG t = (BN_ULLONG)(a)*(b); \ | |
509 | BN_ULLONG tt = t+c0; /* no carry */ \ | |
510 | c0 = (BN_ULONG)Lw(tt); \ | |
511 | hi = (BN_ULONG)Hw(tt); \ | |
512 | c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ | |
513 | t += c0; /* no carry */ \ | |
514 | c0 = (BN_ULONG)Lw(t); \ | |
515 | hi = (BN_ULONG)Hw(t); \ | |
516 | c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ | |
517 | } while(0) | |
518 | ||
519 | # define sqr_add_c(a,i,c0,c1,c2) do { \ | |
520 | BN_ULONG hi; \ | |
521 | BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \ | |
522 | t += c0; /* no carry */ \ | |
523 | c0 = (BN_ULONG)Lw(t); \ | |
524 | hi = (BN_ULONG)Hw(t); \ | |
525 | c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ | |
526 | } while(0) | |
527 | ||
528 | # define sqr_add_c2(a,i,j,c0,c1,c2) \ | |
529 | mul_add_c2((a)[i],(a)[j],c0,c1,c2) | |
530 | ||
531 | # elif defined(BN_UMULT_LOHI) | |
a7a44ba5 AP |
532 | /* |
533 | * Keep in mind that additions to hi can not overflow, because | |
534 | * the high word of a multiplication result cannot be all-ones. | |
535 | */ | |
0f113f3e MC |
536 | # define mul_add_c(a,b,c0,c1,c2) do { \ |
537 | BN_ULONG ta = (a), tb = (b); \ | |
538 | BN_ULONG lo, hi; \ | |
539 | BN_UMULT_LOHI(lo,hi,ta,tb); \ | |
540 | c0 += lo; hi += (c0<lo)?1:0; \ | |
541 | c1 += hi; c2 += (c1<hi)?1:0; \ | |
542 | } while(0) | |
543 | ||
544 | # define mul_add_c2(a,b,c0,c1,c2) do { \ | |
545 | BN_ULONG ta = (a), tb = (b); \ | |
546 | BN_ULONG lo, hi, tt; \ | |
547 | BN_UMULT_LOHI(lo,hi,ta,tb); \ | |
548 | c0 += lo; tt = hi+((c0<lo)?1:0); \ | |
549 | c1 += tt; c2 += (c1<tt)?1:0; \ | |
550 | c0 += lo; hi += (c0<lo)?1:0; \ | |
551 | c1 += hi; c2 += (c1<hi)?1:0; \ | |
552 | } while(0) | |
553 | ||
554 | # define sqr_add_c(a,i,c0,c1,c2) do { \ | |
555 | BN_ULONG ta = (a)[i]; \ | |
556 | BN_ULONG lo, hi; \ | |
557 | BN_UMULT_LOHI(lo,hi,ta,ta); \ | |
558 | c0 += lo; hi += (c0<lo)?1:0; \ | |
559 | c1 += hi; c2 += (c1<hi)?1:0; \ | |
560 | } while(0) | |
561 | ||
562 | # define sqr_add_c2(a,i,j,c0,c1,c2) \ | |
563 | mul_add_c2((a)[i],(a)[j],c0,c1,c2) | |
564 | ||
565 | # elif defined(BN_UMULT_HIGH) | |
a7a44ba5 AP |
566 | /* |
567 | * Keep in mind that additions to hi can not overflow, because | |
568 | * the high word of a multiplication result cannot be all-ones. | |
569 | */ | |
0f113f3e MC |
570 | # define mul_add_c(a,b,c0,c1,c2) do { \ |
571 | BN_ULONG ta = (a), tb = (b); \ | |
572 | BN_ULONG lo = ta * tb; \ | |
573 | BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ | |
574 | c0 += lo; hi += (c0<lo)?1:0; \ | |
575 | c1 += hi; c2 += (c1<hi)?1:0; \ | |
576 | } while(0) | |
577 | ||
578 | # define mul_add_c2(a,b,c0,c1,c2) do { \ | |
579 | BN_ULONG ta = (a), tb = (b), tt; \ | |
580 | BN_ULONG lo = ta * tb; \ | |
581 | BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ | |
582 | c0 += lo; tt = hi + ((c0<lo)?1:0); \ | |
583 | c1 += tt; c2 += (c1<tt)?1:0; \ | |
584 | c0 += lo; hi += (c0<lo)?1:0; \ | |
585 | c1 += hi; c2 += (c1<hi)?1:0; \ | |
586 | } while(0) | |
587 | ||
588 | # define sqr_add_c(a,i,c0,c1,c2) do { \ | |
589 | BN_ULONG ta = (a)[i]; \ | |
590 | BN_ULONG lo = ta * ta; \ | |
591 | BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \ | |
592 | c0 += lo; hi += (c0<lo)?1:0; \ | |
593 | c1 += hi; c2 += (c1<hi)?1:0; \ | |
594 | } while(0) | |
595 | ||
596 | # define sqr_add_c2(a,i,j,c0,c1,c2) \ | |
597 | mul_add_c2((a)[i],(a)[j],c0,c1,c2) | |
598 | ||
599 | # else /* !BN_LLONG */ | |
a7a44ba5 AP |
600 | /* |
601 | * Keep in mind that additions to hi can not overflow, because | |
602 | * the high word of a multiplication result cannot be all-ones. | |
603 | */ | |
0f113f3e MC |
604 | # define mul_add_c(a,b,c0,c1,c2) do { \ |
605 | BN_ULONG lo = LBITS(a), hi = HBITS(a); \ | |
606 | BN_ULONG bl = LBITS(b), bh = HBITS(b); \ | |
607 | mul64(lo,hi,bl,bh); \ | |
608 | c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ | |
609 | c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ | |
610 | } while(0) | |
611 | ||
612 | # define mul_add_c2(a,b,c0,c1,c2) do { \ | |
613 | BN_ULONG tt; \ | |
614 | BN_ULONG lo = LBITS(a), hi = HBITS(a); \ | |
615 | BN_ULONG bl = LBITS(b), bh = HBITS(b); \ | |
616 | mul64(lo,hi,bl,bh); \ | |
617 | tt = hi; \ | |
618 | c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \ | |
619 | c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \ | |
620 | c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ | |
621 | c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ | |
622 | } while(0) | |
623 | ||
624 | # define sqr_add_c(a,i,c0,c1,c2) do { \ | |
625 | BN_ULONG lo, hi; \ | |
626 | sqr64(lo,hi,(a)[i]); \ | |
627 | c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ | |
628 | c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ | |
629 | } while(0) | |
dfeab068 | 630 | |
7cc63545 | 631 | # define sqr_add_c2(a,i,j,c0,c1,c2) \ |
0f113f3e MC |
632 | mul_add_c2((a)[i],(a)[j],c0,c1,c2) |
633 | # endif /* !BN_LLONG */ | |
dfeab068 | 634 | |
6b691a5c | 635 | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
0f113f3e MC |
636 | { |
637 | BN_ULONG c1, c2, c3; | |
638 | ||
639 | c1 = 0; | |
640 | c2 = 0; | |
641 | c3 = 0; | |
642 | mul_add_c(a[0], b[0], c1, c2, c3); | |
643 | r[0] = c1; | |
644 | c1 = 0; | |
645 | mul_add_c(a[0], b[1], c2, c3, c1); | |
646 | mul_add_c(a[1], b[0], c2, c3, c1); | |
647 | r[1] = c2; | |
648 | c2 = 0; | |
649 | mul_add_c(a[2], b[0], c3, c1, c2); | |
650 | mul_add_c(a[1], b[1], c3, c1, c2); | |
651 | mul_add_c(a[0], b[2], c3, c1, c2); | |
652 | r[2] = c3; | |
653 | c3 = 0; | |
654 | mul_add_c(a[0], b[3], c1, c2, c3); | |
655 | mul_add_c(a[1], b[2], c1, c2, c3); | |
656 | mul_add_c(a[2], b[1], c1, c2, c3); | |
657 | mul_add_c(a[3], b[0], c1, c2, c3); | |
658 | r[3] = c1; | |
659 | c1 = 0; | |
660 | mul_add_c(a[4], b[0], c2, c3, c1); | |
661 | mul_add_c(a[3], b[1], c2, c3, c1); | |
662 | mul_add_c(a[2], b[2], c2, c3, c1); | |
663 | mul_add_c(a[1], b[3], c2, c3, c1); | |
664 | mul_add_c(a[0], b[4], c2, c3, c1); | |
665 | r[4] = c2; | |
666 | c2 = 0; | |
667 | mul_add_c(a[0], b[5], c3, c1, c2); | |
668 | mul_add_c(a[1], b[4], c3, c1, c2); | |
669 | mul_add_c(a[2], b[3], c3, c1, c2); | |
670 | mul_add_c(a[3], b[2], c3, c1, c2); | |
671 | mul_add_c(a[4], b[1], c3, c1, c2); | |
672 | mul_add_c(a[5], b[0], c3, c1, c2); | |
673 | r[5] = c3; | |
674 | c3 = 0; | |
675 | mul_add_c(a[6], b[0], c1, c2, c3); | |
676 | mul_add_c(a[5], b[1], c1, c2, c3); | |
677 | mul_add_c(a[4], b[2], c1, c2, c3); | |
678 | mul_add_c(a[3], b[3], c1, c2, c3); | |
679 | mul_add_c(a[2], b[4], c1, c2, c3); | |
680 | mul_add_c(a[1], b[5], c1, c2, c3); | |
681 | mul_add_c(a[0], b[6], c1, c2, c3); | |
682 | r[6] = c1; | |
683 | c1 = 0; | |
684 | mul_add_c(a[0], b[7], c2, c3, c1); | |
685 | mul_add_c(a[1], b[6], c2, c3, c1); | |
686 | mul_add_c(a[2], b[5], c2, c3, c1); | |
687 | mul_add_c(a[3], b[4], c2, c3, c1); | |
688 | mul_add_c(a[4], b[3], c2, c3, c1); | |
689 | mul_add_c(a[5], b[2], c2, c3, c1); | |
690 | mul_add_c(a[6], b[1], c2, c3, c1); | |
691 | mul_add_c(a[7], b[0], c2, c3, c1); | |
692 | r[7] = c2; | |
693 | c2 = 0; | |
694 | mul_add_c(a[7], b[1], c3, c1, c2); | |
695 | mul_add_c(a[6], b[2], c3, c1, c2); | |
696 | mul_add_c(a[5], b[3], c3, c1, c2); | |
697 | mul_add_c(a[4], b[4], c3, c1, c2); | |
698 | mul_add_c(a[3], b[5], c3, c1, c2); | |
699 | mul_add_c(a[2], b[6], c3, c1, c2); | |
700 | mul_add_c(a[1], b[7], c3, c1, c2); | |
701 | r[8] = c3; | |
702 | c3 = 0; | |
703 | mul_add_c(a[2], b[7], c1, c2, c3); | |
704 | mul_add_c(a[3], b[6], c1, c2, c3); | |
705 | mul_add_c(a[4], b[5], c1, c2, c3); | |
706 | mul_add_c(a[5], b[4], c1, c2, c3); | |
707 | mul_add_c(a[6], b[3], c1, c2, c3); | |
708 | mul_add_c(a[7], b[2], c1, c2, c3); | |
709 | r[9] = c1; | |
710 | c1 = 0; | |
711 | mul_add_c(a[7], b[3], c2, c3, c1); | |
712 | mul_add_c(a[6], b[4], c2, c3, c1); | |
713 | mul_add_c(a[5], b[5], c2, c3, c1); | |
714 | mul_add_c(a[4], b[6], c2, c3, c1); | |
715 | mul_add_c(a[3], b[7], c2, c3, c1); | |
716 | r[10] = c2; | |
717 | c2 = 0; | |
718 | mul_add_c(a[4], b[7], c3, c1, c2); | |
719 | mul_add_c(a[5], b[6], c3, c1, c2); | |
720 | mul_add_c(a[6], b[5], c3, c1, c2); | |
721 | mul_add_c(a[7], b[4], c3, c1, c2); | |
722 | r[11] = c3; | |
723 | c3 = 0; | |
724 | mul_add_c(a[7], b[5], c1, c2, c3); | |
725 | mul_add_c(a[6], b[6], c1, c2, c3); | |
726 | mul_add_c(a[5], b[7], c1, c2, c3); | |
727 | r[12] = c1; | |
728 | c1 = 0; | |
729 | mul_add_c(a[6], b[7], c2, c3, c1); | |
730 | mul_add_c(a[7], b[6], c2, c3, c1); | |
731 | r[13] = c2; | |
732 | c2 = 0; | |
733 | mul_add_c(a[7], b[7], c3, c1, c2); | |
734 | r[14] = c3; | |
735 | r[15] = c1; | |
736 | } | |
dfeab068 | 737 | |
6b691a5c | 738 | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
0f113f3e MC |
739 | { |
740 | BN_ULONG c1, c2, c3; | |
741 | ||
742 | c1 = 0; | |
743 | c2 = 0; | |
744 | c3 = 0; | |
745 | mul_add_c(a[0], b[0], c1, c2, c3); | |
746 | r[0] = c1; | |
747 | c1 = 0; | |
748 | mul_add_c(a[0], b[1], c2, c3, c1); | |
749 | mul_add_c(a[1], b[0], c2, c3, c1); | |
750 | r[1] = c2; | |
751 | c2 = 0; | |
752 | mul_add_c(a[2], b[0], c3, c1, c2); | |
753 | mul_add_c(a[1], b[1], c3, c1, c2); | |
754 | mul_add_c(a[0], b[2], c3, c1, c2); | |
755 | r[2] = c3; | |
756 | c3 = 0; | |
757 | mul_add_c(a[0], b[3], c1, c2, c3); | |
758 | mul_add_c(a[1], b[2], c1, c2, c3); | |
759 | mul_add_c(a[2], b[1], c1, c2, c3); | |
760 | mul_add_c(a[3], b[0], c1, c2, c3); | |
761 | r[3] = c1; | |
762 | c1 = 0; | |
763 | mul_add_c(a[3], b[1], c2, c3, c1); | |
764 | mul_add_c(a[2], b[2], c2, c3, c1); | |
765 | mul_add_c(a[1], b[3], c2, c3, c1); | |
766 | r[4] = c2; | |
767 | c2 = 0; | |
768 | mul_add_c(a[2], b[3], c3, c1, c2); | |
769 | mul_add_c(a[3], b[2], c3, c1, c2); | |
770 | r[5] = c3; | |
771 | c3 = 0; | |
772 | mul_add_c(a[3], b[3], c1, c2, c3); | |
773 | r[6] = c1; | |
774 | r[7] = c2; | |
775 | } | |
dfeab068 | 776 | |
cbd48ba6 | 777 | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) |
0f113f3e MC |
778 | { |
779 | BN_ULONG c1, c2, c3; | |
780 | ||
781 | c1 = 0; | |
782 | c2 = 0; | |
783 | c3 = 0; | |
784 | sqr_add_c(a, 0, c1, c2, c3); | |
785 | r[0] = c1; | |
786 | c1 = 0; | |
787 | sqr_add_c2(a, 1, 0, c2, c3, c1); | |
788 | r[1] = c2; | |
789 | c2 = 0; | |
790 | sqr_add_c(a, 1, c3, c1, c2); | |
791 | sqr_add_c2(a, 2, 0, c3, c1, c2); | |
792 | r[2] = c3; | |
793 | c3 = 0; | |
794 | sqr_add_c2(a, 3, 0, c1, c2, c3); | |
795 | sqr_add_c2(a, 2, 1, c1, c2, c3); | |
796 | r[3] = c1; | |
797 | c1 = 0; | |
798 | sqr_add_c(a, 2, c2, c3, c1); | |
799 | sqr_add_c2(a, 3, 1, c2, c3, c1); | |
800 | sqr_add_c2(a, 4, 0, c2, c3, c1); | |
801 | r[4] = c2; | |
802 | c2 = 0; | |
803 | sqr_add_c2(a, 5, 0, c3, c1, c2); | |
804 | sqr_add_c2(a, 4, 1, c3, c1, c2); | |
805 | sqr_add_c2(a, 3, 2, c3, c1, c2); | |
806 | r[5] = c3; | |
807 | c3 = 0; | |
808 | sqr_add_c(a, 3, c1, c2, c3); | |
809 | sqr_add_c2(a, 4, 2, c1, c2, c3); | |
810 | sqr_add_c2(a, 5, 1, c1, c2, c3); | |
811 | sqr_add_c2(a, 6, 0, c1, c2, c3); | |
812 | r[6] = c1; | |
813 | c1 = 0; | |
814 | sqr_add_c2(a, 7, 0, c2, c3, c1); | |
815 | sqr_add_c2(a, 6, 1, c2, c3, c1); | |
816 | sqr_add_c2(a, 5, 2, c2, c3, c1); | |
817 | sqr_add_c2(a, 4, 3, c2, c3, c1); | |
818 | r[7] = c2; | |
819 | c2 = 0; | |
820 | sqr_add_c(a, 4, c3, c1, c2); | |
821 | sqr_add_c2(a, 5, 3, c3, c1, c2); | |
822 | sqr_add_c2(a, 6, 2, c3, c1, c2); | |
823 | sqr_add_c2(a, 7, 1, c3, c1, c2); | |
824 | r[8] = c3; | |
825 | c3 = 0; | |
826 | sqr_add_c2(a, 7, 2, c1, c2, c3); | |
827 | sqr_add_c2(a, 6, 3, c1, c2, c3); | |
828 | sqr_add_c2(a, 5, 4, c1, c2, c3); | |
829 | r[9] = c1; | |
830 | c1 = 0; | |
831 | sqr_add_c(a, 5, c2, c3, c1); | |
832 | sqr_add_c2(a, 6, 4, c2, c3, c1); | |
833 | sqr_add_c2(a, 7, 3, c2, c3, c1); | |
834 | r[10] = c2; | |
835 | c2 = 0; | |
836 | sqr_add_c2(a, 7, 4, c3, c1, c2); | |
837 | sqr_add_c2(a, 6, 5, c3, c1, c2); | |
838 | r[11] = c3; | |
839 | c3 = 0; | |
840 | sqr_add_c(a, 6, c1, c2, c3); | |
841 | sqr_add_c2(a, 7, 5, c1, c2, c3); | |
842 | r[12] = c1; | |
843 | c1 = 0; | |
844 | sqr_add_c2(a, 7, 6, c2, c3, c1); | |
845 | r[13] = c2; | |
846 | c2 = 0; | |
847 | sqr_add_c(a, 7, c3, c1, c2); | |
848 | r[14] = c3; | |
849 | r[15] = c1; | |
850 | } | |
dfeab068 | 851 | |
cbd48ba6 | 852 | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) |
0f113f3e MC |
853 | { |
854 | BN_ULONG c1, c2, c3; | |
855 | ||
856 | c1 = 0; | |
857 | c2 = 0; | |
858 | c3 = 0; | |
859 | sqr_add_c(a, 0, c1, c2, c3); | |
860 | r[0] = c1; | |
861 | c1 = 0; | |
862 | sqr_add_c2(a, 1, 0, c2, c3, c1); | |
863 | r[1] = c2; | |
864 | c2 = 0; | |
865 | sqr_add_c(a, 1, c3, c1, c2); | |
866 | sqr_add_c2(a, 2, 0, c3, c1, c2); | |
867 | r[2] = c3; | |
868 | c3 = 0; | |
869 | sqr_add_c2(a, 3, 0, c1, c2, c3); | |
870 | sqr_add_c2(a, 2, 1, c1, c2, c3); | |
871 | r[3] = c1; | |
872 | c1 = 0; | |
873 | sqr_add_c(a, 2, c2, c3, c1); | |
874 | sqr_add_c2(a, 3, 1, c2, c3, c1); | |
875 | r[4] = c2; | |
876 | c2 = 0; | |
877 | sqr_add_c2(a, 3, 2, c3, c1, c2); | |
878 | r[5] = c3; | |
879 | c3 = 0; | |
880 | sqr_add_c(a, 3, c1, c2, c3); | |
881 | r[6] = c1; | |
882 | r[7] = c2; | |
883 | } | |
884 | ||
885 | # ifdef OPENSSL_NO_ASM | |
886 | # ifdef OPENSSL_BN_ASM_MONT | |
887 | # include <alloca.h> | |
e7382805 AP |
888 | /* |
889 | * This is essentially reference implementation, which may or may not | |
ca04d7a2 AP |
890 | * result in performance improvement. E.g. on IA-32 this routine was |
891 | * observed to give 40% faster rsa1024 private key operations and 10% | |
892 | * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only | |
893 | * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a | |
df94f187 AP |
894 | * reference implementation, one to be used as starting point for |
895 | * platform-specific assembler. Mentioned numbers apply to compiler | |
896 | * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and | |
897 | * can vary not only from platform to platform, but even for compiler | |
898 | * versions. Assembler vs. assembler improvement coefficients can | |
899 | * [and are known to] differ and are to be documented elsewhere. | |
e7382805 | 900 | */ |
0f113f3e MC |
901 | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
902 | const BN_ULONG *np, const BN_ULONG *n0p, int num) | |
903 | { | |
904 | BN_ULONG c0, c1, ml, *tp, n0; | |
905 | # ifdef mul64 | |
906 | BN_ULONG mh; | |
907 | # endif | |
908 | volatile BN_ULONG *vp; | |
909 | int i = 0, j; | |
910 | ||
911 | # if 0 /* template for platform-specific | |
912 | * implementation */ | |
913 | if (ap == bp) | |
914 | return bn_sqr_mont(rp, ap, np, n0p, num); | |
915 | # endif | |
916 | vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); | |
917 | ||
918 | n0 = *n0p; | |
919 | ||
920 | c0 = 0; | |
921 | ml = bp[0]; | |
922 | # ifdef mul64 | |
923 | mh = HBITS(ml); | |
924 | ml = LBITS(ml); | |
925 | for (j = 0; j < num; ++j) | |
926 | mul(tp[j], ap[j], ml, mh, c0); | |
927 | # else | |
928 | for (j = 0; j < num; ++j) | |
929 | mul(tp[j], ap[j], ml, c0); | |
930 | # endif | |
931 | ||
932 | tp[num] = c0; | |
933 | tp[num + 1] = 0; | |
934 | goto enter; | |
935 | ||
936 | for (i = 0; i < num; i++) { | |
937 | c0 = 0; | |
938 | ml = bp[i]; | |
939 | # ifdef mul64 | |
940 | mh = HBITS(ml); | |
941 | ml = LBITS(ml); | |
942 | for (j = 0; j < num; ++j) | |
943 | mul_add(tp[j], ap[j], ml, mh, c0); | |
944 | # else | |
945 | for (j = 0; j < num; ++j) | |
946 | mul_add(tp[j], ap[j], ml, c0); | |
947 | # endif | |
948 | c1 = (tp[num] + c0) & BN_MASK2; | |
949 | tp[num] = c1; | |
950 | tp[num + 1] = (c1 < c0 ? 1 : 0); | |
951 | enter: | |
952 | c1 = tp[0]; | |
953 | ml = (c1 * n0) & BN_MASK2; | |
954 | c0 = 0; | |
955 | # ifdef mul64 | |
956 | mh = HBITS(ml); | |
957 | ml = LBITS(ml); | |
958 | mul_add(c1, np[0], ml, mh, c0); | |
959 | # else | |
960 | mul_add(c1, ml, np[0], c0); | |
961 | # endif | |
962 | for (j = 1; j < num; j++) { | |
963 | c1 = tp[j]; | |
964 | # ifdef mul64 | |
965 | mul_add(c1, np[j], ml, mh, c0); | |
966 | # else | |
967 | mul_add(c1, ml, np[j], c0); | |
968 | # endif | |
969 | tp[j - 1] = c1 & BN_MASK2; | |
970 | } | |
971 | c1 = (tp[num] + c0) & BN_MASK2; | |
972 | tp[num - 1] = c1; | |
973 | tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0); | |
974 | } | |
975 | ||
976 | if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { | |
977 | c0 = bn_sub_words(rp, tp, np, num); | |
978 | if (tp[num] != 0 || c0 == 0) { | |
979 | for (i = 0; i < num + 2; i++) | |
980 | vp[i] = 0; | |
981 | return 1; | |
982 | } | |
983 | } | |
984 | for (i = 0; i < num; i++) | |
985 | rp[i] = tp[i], vp[i] = 0; | |
986 | vp[num] = 0; | |
987 | vp[num + 1] = 0; | |
988 | return 1; | |
989 | } | |
990 | # else | |
ca04d7a2 AP |
991 | /* |
992 | * Return value of 0 indicates that multiplication/convolution was not | |
993 | * performed to signal the caller to fall down to alternative/original | |
994 | * code-path. | |
995 | */ | |
0f113f3e MC |
996 | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
997 | const BN_ULONG *np, const BN_ULONG *n0, int num) | |
998 | { | |
999 | return 0; | |
1000 | } | |
1001 | # endif /* OPENSSL_BN_ASM_MONT */ | |
1002 | # endif | |
e7382805 | 1003 | |
0f113f3e | 1004 | #else /* !BN_MUL_COMBA */ |
dfeab068 RE |
1005 | |
1006 | /* hmm... is it faster just to do a multiply? */ | |
0f113f3e MC |
1007 | # undef bn_sqr_comba4 |
1008 | # undef bn_sqr_comba8 | |
e7382805 | 1009 | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) |
0f113f3e MC |
1010 | { |
1011 | BN_ULONG t[8]; | |
1012 | bn_sqr_normal(r, a, 4, t); | |
1013 | } | |
dfeab068 | 1014 | |
e7382805 | 1015 | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) |
0f113f3e MC |
1016 | { |
1017 | BN_ULONG t[16]; | |
1018 | bn_sqr_normal(r, a, 8, t); | |
1019 | } | |
dfeab068 | 1020 | |
6b691a5c | 1021 | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
0f113f3e MC |
1022 | { |
1023 | r[4] = bn_mul_words(&(r[0]), a, 4, b[0]); | |
1024 | r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]); | |
1025 | r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]); | |
1026 | r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]); | |
1027 | } | |
dfeab068 | 1028 | |
6b691a5c | 1029 | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
0f113f3e MC |
1030 | { |
1031 | r[8] = bn_mul_words(&(r[0]), a, 8, b[0]); | |
1032 | r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]); | |
1033 | r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]); | |
1034 | r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]); | |
1035 | r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]); | |
1036 | r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]); | |
1037 | r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]); | |
1038 | r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]); | |
1039 | } | |
1040 | ||
1041 | # ifdef OPENSSL_NO_ASM | |
1042 | # ifdef OPENSSL_BN_ASM_MONT | |
1043 | # include <alloca.h> | |
1044 | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, | |
1045 | const BN_ULONG *np, const BN_ULONG *n0p, int num) | |
1046 | { | |
1047 | BN_ULONG c0, c1, *tp, n0 = *n0p; | |
1048 | volatile BN_ULONG *vp; | |
1049 | int i = 0, j; | |
1050 | ||
1051 | vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); | |
1052 | ||
1053 | for (i = 0; i <= num; i++) | |
1054 | tp[i] = 0; | |
1055 | ||
1056 | for (i = 0; i < num; i++) { | |
1057 | c0 = bn_mul_add_words(tp, ap, num, bp[i]); | |
1058 | c1 = (tp[num] + c0) & BN_MASK2; | |
1059 | tp[num] = c1; | |
1060 | tp[num + 1] = (c1 < c0 ? 1 : 0); | |
1061 | ||
1062 | c0 = bn_mul_add_words(tp, np, num, tp[0] * n0); | |
1063 | c1 = (tp[num] + c0) & BN_MASK2; | |
1064 | tp[num] = c1; | |
1065 | tp[num + 1] += (c1 < c0 ? 1 : 0); | |
1066 | for (j = 0; j <= num; j++) | |
1067 | tp[j] = tp[j + 1]; | |
1068 | } | |
1069 | ||
1070 | if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { | |
1071 | c0 = bn_sub_words(rp, tp, np, num); | |
1072 | if (tp[num] != 0 || c0 == 0) { | |
1073 | for (i = 0; i < num + 2; i++) | |
1074 | vp[i] = 0; | |
1075 | return 1; | |
1076 | } | |
1077 | } | |
1078 | for (i = 0; i < num; i++) | |
1079 | rp[i] = tp[i], vp[i] = 0; | |
1080 | vp[num] = 0; | |
1081 | vp[num + 1] = 0; | |
1082 | return 1; | |
1083 | } | |
1084 | # else | |
1085 | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, | |
1086 | const BN_ULONG *np, const BN_ULONG *n0, int num) | |
1087 | { | |
1088 | return 0; | |
1089 | } | |
1090 | # endif /* OPENSSL_BN_ASM_MONT */ | |
1091 | # endif | |
1092 | ||
1093 | #endif /* !BN_MUL_COMBA */ |