/*
 * Copyright (c) 1995  Colin Plumb.  All rights reserved.
 * For licensing and other legal details, see the file legal.c.
 *
 * lbn64.c - Low-level bignum routines, 64-bit version.
 *
 * NOTE: the magic constants "64" and "128" appear in many places in this
 * file, including inside identifiers.  Because it is not possible to
 * ask "#ifdef" of a macro expansion, it is not possible to use the
 * preprocessor to conditionalize these properly.  Thus, this file is
 * intended to be edited with textual search and replace to produce
 * alternate word size versions.  Any reference to the number of bits
 * in a word must be the string "64", and that string must not appear
 * otherwise.  Any reference to twice this number must appear as "128",
 * which likewise must not appear otherwise.  Is that clear?
 *
 * Remember, when doubling the bit size replace the larger number (128)
 * first, then the smaller (64).  When halving the bit size, do the
 * opposite.  Otherwise, things will get weird.  Also, be sure to replace
 * every instance that appears.  (:%s/foo/bar/g in vi)
 *
 * These routines work with a pointer to the least-significant end of
 * an array of WORD64s.  The BIG(x), LITTLE(y) and BIGLTTLE(x,y) macros
 * defined in lbn.h (which expand to x on a big-endian machine and y on a
 * little-endian machine) are used to conditionalize the code to work
 * either way.  If you have no assembly primitives, it doesn't matter.
 * Note that on a big-endian machine, the least-significant-end pointer
 * is ONE PAST THE END.  The bytes are ptr[-1] through ptr[-len].
 * On little-endian, they are ptr[0] through ptr[len-1].  This makes
 * perfect sense if you consider pointers to point *between* bytes rather
 * than at them.  (A small sketch of this convention follows this comment.)
 *
 * Because the array index values are unsigned integers, ptr[-i]
 * may not work properly, since the index -i is evaluated as an unsigned,
 * and if pointers are wider, zero-extension will produce a positive
 * number rather than the needed negative.  The expression used in this
 * code, *(ptr-i) will, however, work.  (The array syntax is equivalent
 * to *(ptr+-i), which is a pretty subtle difference.)
 *
 * Many of these routines will get very unhappy if fed zero-length inputs.
 * They use assert() to enforce this.  A higher layer of code must make
 * sure that these aren't called with zero-length inputs.
 *
 * Any of these routines can be replaced with more efficient versions
 * elsewhere, by just #defining their names.  If one of the names
 * is #defined, the C code is not compiled in and no declaration is
 * made.  Use the BNINCLUDE file to do that.  Typically, you compile
 * asm subroutines with the same name and just, e.g.
 * #define lbnMulAdd1_64 lbnMulAdd1_64
 *
 * If you want to write asm routines, start with lbnMulAdd1_64().
 * This is the workhorse of modular exponentiation.  lbnMulN1_64() is
 * also used a fair bit, although not as much and it's defined in terms
 * of lbnMulAdd1_64 if that has a custom version.  lbnMulSub1_64 and
 * lbnDiv21_64 are used in the usual division and remainder finding.
 * (Not the Montgomery reduction used in modular exponentiation, though.)
 * Once you have lbnMulAdd1_64 defined, writing the other two should
 * be pretty easy.  (Just make sure you get the sign of the subtraction
 * in lbnMulSub1_64 right - it's dest = dest - source * k.)
 *
 * The only definitions that absolutely need a double-word (BNWORD128)
 * type are lbnMulAdd1_64 and lbnMulSub1_64; if those are provided,
 * the rest follows.  lbnDiv21_64, however, is a lot slower unless you
 * have them, and lbnModQ_64 takes after it.  That one is used quite a
 * bit for prime sieving.
 */
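/*
 * Illustrative sketch (added for clarity; not part of the original
 * source): how the least-significant-end pointer convention described
 * above plays out for a 3-word number.  The variable names are made up.
 */
#if 0	/* example only, not compiled */
BNWORD64 a[3];
BNWORD64 *lsw_big = a + 3;	/* big-endian: LSW is lsw_big[-1] (a[2]),
				   MSW is lsw_big[-3] (a[0]) */
BNWORD64 *lsw_little = a;	/* little-endian: LSW is lsw_little[0],
				   MSW is lsw_little[2] */
/* Word i (i = 0 is least significant) on either layout: */
/*	BIGLITTLE(*(lsw - i - 1), *(lsw + i))	*/
#endif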
#ifndef HAVE_CONFIG_H
#define HAVE_CONFIG_H 0
#endif

/*
 * Some compilers complain about #if FOO if FOO isn't defined,
 * so do the ANSI-mandated thing explicitly...
 */
#ifndef HAVE_STRINGS_H
#define HAVE_STRINGS_H 0
#endif

#ifndef assert
#define assert(x) (void)0
#endif

#include <string.h>	/* For memcpy */

#ifndef BNWORD64
#error 64-bit bignum library requires a 64-bit data type
#endif

/* If this is defined, include bnYield() calls */
#if BNYIELD
extern int (*bnYield)(void);	/* From bn.c */
#endif
/*
 * Most of the multiply (and Montgomery reduce) routines use an outer
 * loop that iterates over one of the operands - a so-called operand
 * scanning approach.  One big advantage of this is that the assembly
 * support routines are simpler.  The loops can be rearranged to have
 * an outer loop that iterates over the product, a so-called product
 * scanning approach.  This has the advantage of writing less data
 * and doing fewer adds to memory, so is supposedly faster.  Some
 * code has been written using a product-scanning approach, but
 * it appears to be slower, so it is turned off by default.  Some
 * experimentation would be appreciated.  (A sketch of the two loop
 * shapes follows the definition below.)
 *
 * (The code is also annoying to get right and not very well commented,
 * one of my pet peeves about math libraries.  I'm sorry.)
 */
#define PRODUCT_SCAN 0
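/*
 * Illustrative sketch (added; not part of the original source): the two
 * loop shapes described above, written out on plain arrays.  32-bit
 * digits are used so a standard 64-bit accumulator can serve as the
 * "double word"; the carry-counting trick mirrors the PRODUCT_SCAN code
 * later in this file.  Names and the <stdint.h> types are assumptions.
 */
#if 0	/* example only, not compiled */
#include <stdint.h>

/* Operand scanning: the outer loop walks operand b; each pass does a
 * read-modify-write multiply-accumulate over the whole product. */
static void
mul_operand_scan(uint32_t *prod /* 2*len words, zeroed */,
	const uint32_t *a, const uint32_t *b, unsigned len)
{
	unsigned i, j;

	for (i = 0; i < len; i++) {
		uint64_t t = 0;
		for (j = 0; j < len; j++) {
			t += (uint64_t)a[j] * b[i] + prod[i+j];
			prod[i+j] = (uint32_t)t;
			t >>= 32;
		}
		prod[i+len] = (uint32_t)t;
	}
}

/* Product scanning: the outer loop walks the product; each output word
 * is finished (written exactly once) before moving on. */
static void
mul_product_scan(uint32_t *prod /* 2*len words */,
	const uint32_t *a, const uint32_t *b, unsigned len)
{
	uint64_t x = 0;		/* running column sum, low part */
	uint32_t carry = 0;	/* overflows of x, in units of 2^64 */
	unsigned i, k;

	for (k = 0; k < 2*len - 1; k++) {
		for (i = (k < len) ? 0 : k - len + 1; i <= k && i < len; i++) {
			uint64_t y = (uint64_t)a[i] * b[k-i];
			x += y;
			carry += (x < y);	/* detect wraparound */
		}
		prod[k] = (uint32_t)x;
		x = (x >> 32) | ((uint64_t)carry << 32);
		carry = 0;
	}
	prod[2*len - 1] = (uint32_t)x;
}
#endif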
/*
 * Copy an array of words.  <Marvin mode on>  Thrilling, isn't it? </Marvin>
 * This is a good example of how the byte offsets and BIGLITTLE() macros work.
 * Another alternative would have been
 * memcpy(dest BIG(-len), src BIG(-len), len*sizeof(BNWORD64)), but I find that
 * putting operators into conditional macros is confusing.
 */
#ifndef lbnCopy_64
void
lbnCopy_64(BNWORD64 *dest, BNWORD64 const *src, unsigned len)
{
	memcpy(BIGLITTLE(dest-len,dest), BIGLITTLE(src-len,src),
	       len * sizeof(*src));
}
#endif /* !lbnCopy_64 */
/*
 * Fill n words with zero.  This does it manually rather than calling
 * memset because it can assume alignment to make things faster while
 * memset can't.  Note how big-endian numbers are naturally addressed
 * using predecrement, while little-endian is postincrement.
 */
#ifndef lbnZero_64
void
lbnZero_64(BNWORD64 *num, unsigned len)
{
	while (len--)
		BIGLITTLE(*--num,*num++) = 0;
}
#endif /* !lbnZero_64 */
/*
 * Negate an array of words.
 * Negation is subtraction from zero.  Negating low-order words
 * entails doing nothing until a non-zero word is hit.  Once that
 * is negated, a borrow is generated and never dies until the end
 * of the number is hit.  Negation with borrow, -x-1, is the same as ~x.
 * Repeat that until the end of the number.
 *
 * Doesn't return borrow out because that's pretty useless - it's
 * always set unless the input is 0, which is easy to notice in
 * normalized form.
 */
#ifndef lbnNeg_64
void
lbnNeg_64(BNWORD64 *num, unsigned len)
{
	assert(len);

	/* Skip low-order zero words */
	while (BIGLITTLE(*--num,*num) == 0) {
		if (!--len)
			return;
		LITTLE(num++;)
	}
	/* Negate the lowest-order non-zero word */
	*num = -*num;
	/* Complement all the higher-order words */
	while (--len) {
		BIGLITTLE(--num,++num);
		*num = ~*num;
	}
}
#endif /* !lbnNeg_64 */
/*
 * lbnAdd1_64: add the single-word "carry" to the given number.
 * Used for minor increments and propagating the carry after
 * adding in a shorter bignum.
 *
 * Technique: If we have a double-width word, presumably the compiler
 * can add using its carry in inline code, so we just use a larger
 * accumulator to compute the carry from the first addition.
 * If not, it's more complex.  After adding the first carry, which may
 * be > 1, compare the sum and the carry.  If the sum wraps (causing a
 * carry out from the addition), the result will be less than each of the
 * inputs, since the wrap subtracts a number (2^64) which is larger than
 * the other input can possibly be.  If the sum is >= the carry input,
 * return success immediately.
 * In either case, if there is a carry, enter a loop incrementing words
 * until one does not wrap.  Since we are adding 1 each time, the wrap
 * will be to 0 and we can test for equality.
 */
#ifndef lbnAdd1_64	/* If defined, it's provided as an asm subroutine */
#if defined(BNWORD128)
BNWORD64
lbnAdd1_64(BNWORD64 *num, unsigned len, BNWORD64 carry)
{
	BNWORD128 t;

	assert(len > 0);	/* Alternative: if (!len) return carry */

	t = (BNWORD128)BIGLITTLE(*--num,*num) + carry;
	BIGLITTLE(*num,*num++) = (BNWORD64)t;

	if ((t >> 64) == 0)
		return 0;

	while (--len) {
		if (++BIGLITTLE(*--num,*num++) != 0)
			return 0;
	}
	return 1;
}
#else /* no BNWORD128 */
BNWORD64
lbnAdd1_64(BNWORD64 *num, unsigned len, BNWORD64 carry)
{
	assert(len > 0);	/* Alternative: if (!len) return carry */

	if ((BIGLITTLE(*--num,*num++) += carry) >= carry)
		return 0;

	while (--len) {
		if (++BIGLITTLE(*--num,*num++) != 0)
			return 0;
	}
	return 1;
}
#endif /* BNWORD128 */
#endif/* !lbnAdd1_64 */
/*
 * lbnSub1_64: subtract the single-word "borrow" from the given number.
 * Used for minor decrements and propagating the borrow after
 * subtracting a shorter bignum.
 *
 * Technique: Similar to the add, above.  If there is a double-length type,
 * use that to generate the first borrow.
 * If not, after subtracting the first borrow, which may be > 1, compare
 * the difference and the *negative* of the carry.  If the subtract wraps
 * (causing a borrow out from the subtraction), the result will be at least
 * as large as -borrow.  If the result < -borrow, then no borrow out has
 * appeared and we may return immediately, except when borrow == 0.  To
 * deal with that case, use the identity that -x = ~x+1, and instead of
 * comparing < -borrow, compare for <= ~borrow.
 * Either way, if there is a borrow out, enter a loop decrementing words
 * until a non-zero word is reached.
 *
 * Note the cast of ~borrow to (BNWORD64).  If the size of an int is larger
 * than BNWORD64, C rules say the number is expanded for the arithmetic, so
 * the inversion will be done on an int and the value won't be quite what
 * is expected.
 */
#ifndef lbnSub1_64	/* If defined, it's provided as an asm subroutine */
#if defined(BNWORD128)
BNWORD64
lbnSub1_64(BNWORD64 *num, unsigned len, BNWORD64 borrow)
{
	BNWORD128 t;

	assert(len > 0);	/* Alternative: if (!len) return borrow */

	t = (BNWORD128)BIGLITTLE(*--num,*num) - borrow;
	BIGLITTLE(*num,*num++) = (BNWORD64)t;

	if ((t >> 64) == 0)
		return 0;

	while (--len) {
		if ((BIGLITTLE(*--num,*num++))-- != 0)
			return 0;
	}
	return 1;
}
#else /* no BNWORD128 */
BNWORD64
lbnSub1_64(BNWORD64 *num, unsigned len, BNWORD64 borrow)
{
	assert(len > 0);	/* Alternative: if (!len) return borrow */

	if ((BIGLITTLE(*--num,*num++) -= borrow) <= (BNWORD64)~borrow)
		return 0;

	while (--len) {
		if ((BIGLITTLE(*--num,*num++))-- != 0)
			return 0;
	}
	return 1;
}
#endif /* BNWORD128 */
#endif /* !lbnSub1_64 */
/*
 * lbnAddN_64: add two bignums of the same length, returning the carry (0 or 1).
 * One of the building blocks, along with lbnAdd1, of adding two bignums of
 * differing lengths.
 *
 * Technique: Maintain a word of carry.  If there is no double-width type,
 * use the same technique as in lbnAdd1, above, to maintain the carry by
 * comparing the inputs.  Adding the carry sources is used as an OR operator;
 * at most one of the two comparisons can possibly be true.  The first can
 * only be true if carry == 1 and x, the result, is 0.  In that case the
 * second can't possibly be true.
 */
#ifndef lbnAddN_64
#if defined(BNWORD128)
BNWORD64
lbnAddN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
{
	BNWORD128 t;

	assert(len > 0);

	t = (BNWORD128)BIGLITTLE(*--num1,*num1) + BIGLITTLE(*--num2,*num2++);
	BIGLITTLE(*num1,*num1++) = (BNWORD64)t;

	while (--len) {
		t = (BNWORD128)BIGLITTLE(*--num1,*num1) +
		    (BNWORD128)BIGLITTLE(*--num2,*num2++) + (t >> 64);
		BIGLITTLE(*num1,*num1++) = (BNWORD64)t;
	}

	return (BNWORD64)(t>>64);
}
#else /* no BNWORD128 */
BNWORD64
lbnAddN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
{
	BNWORD64 x, carry = 0;

	assert(len > 0);	/* Alternative: change loop to test at start */

	do {
		x = BIGLITTLE(*--num2,*num2++);
		carry = (x += carry) < carry;
		carry += (BIGLITTLE(*--num1,*num1++) += x) < x;
	} while (--len);

	return carry;
}
#endif /* BNWORD128 */
#endif /* !lbnAddN_64 */
/*
 * lbnSubN_64: subtract two bignums of the same length, returning the
 * borrow (0 or 1).
 * One of the building blocks, along with subn1, of subtracting two bignums of
 * differing lengths.
 *
 * Technique: If no double-width type is available, maintain a word of borrow.
 * First, add the borrow to the subtrahend (did you have to learn all those
 * awful words in elementary school, too?), and if it overflows, set the
 * borrow again.  Then subtract the modified subtrahend from the next word
 * of input, using the same technique as in subn1, above.
 * Adding the borrows is used as an OR operator; at most one of the two
 * comparisons can possibly be true.  The first can only be true if
 * borrow == 1 and x, the result, is 0.  In that case the second can't
 * possibly be true.
 *
 * In the double-word case, (BNWORD64)-(t>>64) is subtracted, rather than
 * adding t>>64, because the shift would need to sign-extend and that's
 * not guaranteed to happen in ANSI C, even with signed types.
 */
#ifndef lbnSubN_64
#if defined(BNWORD128)
BNWORD64
lbnSubN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
{
	BNWORD128 t;

	assert(len > 0);

	t = (BNWORD128)BIGLITTLE(*--num1,*num1) - BIGLITTLE(*--num2,*num2++);
	BIGLITTLE(*num1,*num1++) = (BNWORD64)t;

	while (--len) {
		t = (BNWORD128)BIGLITTLE(*--num1,*num1) -
		    (BNWORD128)BIGLITTLE(*--num2,*num2++) - (BNWORD64)-(t >> 64);
		BIGLITTLE(*num1,*num1++) = (BNWORD64)t;
	}

	return -(BNWORD64)(t>>64);
}
#else /* no BNWORD128 */
BNWORD64
lbnSubN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
{
	BNWORD64 x, borrow = 0;

	assert(len > 0);	/* Alternative: change loop to test at start */

	do {
		x = BIGLITTLE(*--num2,*num2++);
		borrow = (x += borrow) < borrow;
		borrow += (BIGLITTLE(*--num1,*num1++) -= x) > (BNWORD64)~x;
	} while (--len);

	return borrow;
}
#endif /* BNWORD128 */
#endif /* !lbnSubN_64 */
/*
 * lbnCmp_64: compare two bignums of equal length, returning the sign of
 * num1 - num2. (-1, 0 or +1).
 *
 * Technique: Change the little-endian pointers to big-endian pointers
 * and compare from the most-significant end until a difference is found.
 * When it is, figure out the sign of the difference and return it.
 */
#ifndef lbnCmp_64
int
lbnCmp_64(BNWORD64 const *num1, BNWORD64 const *num2, unsigned len)
{
	BIGLITTLE(num1 -= len, num1 += len);
	BIGLITTLE(num2 -= len, num2 += len);

	while (len--) {
		if (BIGLITTLE(*num1++ != *num2++, *--num1 != *--num2)) {
			if (BIGLITTLE(num1[-1] < num2[-1], *num1 < *num2))
				return -1;
			else
				return 1;
		}
	}
	return 0;
}
#endif /* !lbnCmp_64 */
/*
 * mul64_ppmmaa(ph,pl,x,y,a,b) is an optional routine that
 * computes (ph,pl) = x * y + a + b.  mul64_ppmma and mul64_ppmm
 * are simpler versions.  If you want to be lazy, all of these
 * can be defined in terms of the others, so here we create any
 * that have not been defined in terms of the ones that have been.
 */

/* Define ones with fewer a's in terms of ones with more a's */
#if !defined(mul64_ppmma) && defined(mul64_ppmmaa)
#define mul64_ppmma(ph,pl,x,y,a) mul64_ppmmaa(ph,pl,x,y,a,0)
#endif

#if !defined(mul64_ppmm) && defined(mul64_ppmma)
#define mul64_ppmm(ph,pl,x,y) mul64_ppmma(ph,pl,x,y,0)
#endif

/*
 * Use this definition to test the mul64_ppmm-based operations on machines
 * that do not provide mul64_ppmm.  Change the final "0" to a "1" to
 * enable it.
 */
#if !defined(mul64_ppmm) && defined(BNWORD128) && 0	/* Debugging */
#define mul64_ppmm(ph,pl,x,y) \
	({BNWORD128 _ = (BNWORD128)(x)*(y); (pl) = _; (ph) = _>>64;})
#endif

#if defined(mul64_ppmm) && !defined(mul64_ppmma)
#define mul64_ppmma(ph,pl,x,y,a) \
	(mul64_ppmm(ph,pl,x,y), (ph) += ((pl) += (a)) < (a))
#endif

#if defined(mul64_ppmma) && !defined(mul64_ppmmaa)
#define mul64_ppmmaa(ph,pl,x,y,a,b) \
	(mul64_ppmma(ph,pl,x,y,a), (ph) += ((pl) += (b)) < (b))
#endif
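/*
 * Illustrative sketch (added; not part of the original source): one
 * plausible way to supply mul64_ppmm on a compiler with a 128-bit
 * integer extension, such as GCC/Clang's unsigned __int128 on 64-bit
 * targets.  The statement-expression form mirrors the debugging
 * definition above; whether this beats the BNWORD128 code path is
 * platform-dependent and would need measuring.
 */
#if 0	/* example only, not compiled */
#define mul64_ppmm(ph,pl,x,y) \
	({unsigned __int128 _t = (unsigned __int128)(x) * (y); \
	  (pl) = (BNWORD64)_t; (ph) = (BNWORD64)(_t >> 64);})
#endif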
/*
 * lbnMulN1_64: Multiply an n-word input by a 1-word input and store the
 * n+1-word product.  This uses either the mul64_ppmm and mul64_ppmma
 * macros, or C multiplication with the BNWORD128 type.  This uses mul64_ppmma
 * if available, assuming you won't bother defining it unless you can do
 * better than the normal multiplication.
 */
#ifndef lbnMulN1_64
#ifdef lbnMulAdd1_64	/* If we have this asm primitive, use it. */
void
lbnMulN1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
{
	lbnZero_64(out, len);
	BIGLITTLE(*(out-len-1),*(out+len)) = lbnMulAdd1_64(out, in, len, k);
}
#elif defined(mul64_ppmm)
void
lbnMulN1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
{
	BNWORD64 carry, carryin;

	assert(len > 0);

	BIG(--out;--in;);
	mul64_ppmm(carry, *out, *in, k);
	LITTLE(out++;in++;)

	while (--len) {
		BIG(--out;--in;);
		carryin = carry;
		mul64_ppmma(carry, *out, *in, k, carryin);
		LITTLE(out++;in++;)
	}
	BIGLITTLE(*--out,*out) = carry;
}
#elif defined(BNWORD128)
void
lbnMulN1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
{
	BNWORD128 p;

	assert(len > 0);

	p = (BNWORD128)BIGLITTLE(*--in,*in++) * k;
	BIGLITTLE(*--out,*out++) = (BNWORD64)p;

	while (--len) {
		p = (BNWORD128)BIGLITTLE(*--in,*in++) * k + (BNWORD64)(p >> 64);
		BIGLITTLE(*--out,*out++) = (BNWORD64)p;
	}
	BIGLITTLE(*--out,*out) = (BNWORD64)(p >> 64);
}
#else
#error No 64x64 -> 128 multiply available for 64-bit bignum package
#endif
#endif /* lbnMulN1_64 */
/*
 * lbnMulAdd1_64: Multiply an n-word input by a 1-word input and add the
 * low n words of the product to the destination.  *Returns the n+1st word
 * of the product.*  (That turns out to be more convenient than adding
 * it into the destination and dealing with a possible unit carry out
 * of *that*.)  This uses either the mul64_ppmma and mul64_ppmmaa macros,
 * or C multiplication with the BNWORD128 type.
 *
 * If you're going to write assembly primitives, this is the one to
 * start with.  It is by far the most commonly called function.
 */
#ifndef lbnMulAdd1_64
#if defined(mul64_ppmm)
BNWORD64
lbnMulAdd1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
{
	BNWORD64 prod, carry, carryin;

	assert(len > 0);

	BIG(--out;--in;);
	carryin = *out;
	mul64_ppmma(carry, *out, *in, k, carryin);
	LITTLE(out++;in++;)

	while (--len) {
		BIG(--out;--in;);
		carryin = carry;
		mul64_ppmmaa(carry, prod, *in, k, carryin, *out);
		*out = prod;
		LITTLE(out++;in++;)
	}

	return carry;
}
#elif defined(BNWORD128)
BNWORD64
lbnMulAdd1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
{
	BNWORD128 p;

	assert(len > 0);

	p = (BNWORD128)BIGLITTLE(*--in,*in++) * k + BIGLITTLE(*--out,*out);
	BIGLITTLE(*out,*out++) = (BNWORD64)p;

	while (--len) {
		p = (BNWORD128)BIGLITTLE(*--in,*in++) * k +
		    (BNWORD64)(p >> 64) + BIGLITTLE(*--out,*out);
		BIGLITTLE(*out,*out++) = (BNWORD64)p;
	}

	return (BNWORD64)(p >> 64);
}
#else
#error No 64x64 -> 128 multiply available for 64-bit bignum package
#endif
#endif /* lbnMulAdd1_64 */
/*
 * lbnMulSub1_64: Multiply an n-word input by a 1-word input and subtract the
 * n-word product from the destination.  Returns the n+1st word of the product.
 * This uses either the mul64_ppmm and mul64_ppmma macros, or
 * C multiplication with the BNWORD128 type.
 *
 * This is rather uglier than adding, but fortunately it's only used in
 * division which is not used too heavily.
 */
#ifndef lbnMulSub1_64
#if defined(mul64_ppmm)
BNWORD64
lbnMulSub1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
{
	BNWORD64 prod, carry, carryin;

	assert(len > 0);

	BIG(--in;)
	mul64_ppmm(carry, prod, *in, k);
	LITTLE(in++;)
	carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD64)~prod;

	while (--len) {
		BIG(--in;);
		carryin = carry;
		mul64_ppmma(carry, prod, *in, k, carryin);
		LITTLE(in++;)
		carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD64)~prod;
	}

	return carry;
}
#elif defined(BNWORD128)
BNWORD64
lbnMulSub1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
{
	BNWORD64 t, carry;
	BNWORD128 p;

	assert(len > 0);

	p = (BNWORD128)BIGLITTLE(*--in,*in++) * k;
	t = BIGLITTLE(*--out,*out);
	carry = (BNWORD64)(p>>64) + ((BIGLITTLE(*out,*out++)=t-(BNWORD64)p) > t);

	while (--len) {
		p = (BNWORD128)BIGLITTLE(*--in,*in++) * k + carry;
		t = BIGLITTLE(*--out,*out);
		carry = (BNWORD64)(p>>64) +
			( (BIGLITTLE(*out,*out++)=t-(BNWORD64)p) > t );
	}

	return carry;
}
#else
#error No 64x64 -> 128 multiply available for 64-bit bignum package
#endif
#endif /* !lbnMulSub1_64 */
/*
 * Shift n words left "shift" bits.  0 < shift < 64.  Returns the
 * carry, any bits shifted off the left-hand side (0 <= carry < 2^shift).
 */
#ifndef lbnLshift_64
BNWORD64
lbnLshift_64(BNWORD64 *num, unsigned len, unsigned shift)
{
	BNWORD64 x, carry = 0;

	assert(shift > 0);
	assert(shift < 64);

	while (len--) {
		BIG(--num;)
		x = *num;
		*num = (x<<shift) | carry;
		LITTLE(num++;)
		carry = x >> (64-shift);
	}
	return carry;
}
#endif /* !lbnLshift_64 */
/*
 * An optimized version of the above, for shifts of 1.
 * Some machines can use add-with-carry tricks for this.
 */
#ifndef lbnDouble_64
BNWORD64
lbnDouble_64(BNWORD64 *num, unsigned len)
{
	BNWORD64 x, carry = 0;

	while (len--) {
		BIG(--num;)
		x = *num;
		*num = (x<<1) | carry;
		LITTLE(num++;)
		carry = x >> (64-1);
	}
	return carry;
}
#endif /* !lbnDouble_64 */
/*
 * Shift n words right "shift" bits.  0 < shift < 64.  Returns the
 * carry, any bits shifted off the right-hand side (0 <= carry < 2^shift).
 */
#ifndef lbnRshift_64
BNWORD64
lbnRshift_64(BNWORD64 *num, unsigned len, unsigned shift)
{
	BNWORD64 x, carry = 0;

	assert(shift > 0);
	assert(shift < 64);

	/* Start from the most-significant end */
	BIGLITTLE(num -= len, num += len);

	while (len--) {
		LITTLE(--num;)
		x = *num;
		*num = (x>>shift) | carry;
		BIG(num++;)
		carry = x << (64-shift);
	}
	return carry >> (64-shift);
}
#endif /* !lbnRshift_64 */
/*
 * Multiply two numbers of the given lengths.  prod and num2 may overlap,
 * provided that the low len1 bits of prod are free.  (This corresponds
 * nicely to the place the result is returned from lbnMontReduce_64.)
 *
 * TODO: Use Karatsuba multiply.  The overlap constraints may have
 * to be rethought.
 */
#ifndef lbnMul_64
void
lbnMul_64(BNWORD64 *prod, BNWORD64 const *num1, unsigned len1,
	BNWORD64 const *num2, unsigned len2)
{
	/* Special case of zero */
	if (!len1 || !len2) {
		lbnZero_64(prod, len1+len2);
		return;
	}

	/* Multiply first word */
	lbnMulN1_64(prod, num1, len1, BIGLITTLE(*--num2,*num2++));

	/*
	 * Add in subsequent words, storing the most significant word,
	 * which is new each time.
	 */
	while (--len2) {
		BIGLITTLE(--prod,prod++);
		BIGLITTLE(*(prod-len1-1),*(prod+len1)) =
		    lbnMulAdd1_64(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
	}
}
#endif /* !lbnMul_64 */
/*
 * lbnMulX_64 is a square multiply - both inputs are the same length.
 * It's normally just a macro wrapper around the general multiply,
 * but might be implementable in assembly more efficiently (such as
 * when product scanning).
 */
#ifndef lbnMulX_64
#if defined(BNWORD128) && PRODUCT_SCAN
/*
 * Test code to see whether product scanning is any faster.  It seems
 * to make the C code slower, so PRODUCT_SCAN is not defined.
 */
void
lbnMulX_64(BNWORD64 *prod, BNWORD64 const *num1, BNWORD64 const *num2,
	unsigned len)
{
	BNWORD128 x, y;
	BNWORD64 const *p1, *p2;
	BNWORD64 carry;
	unsigned i, j;

	/* Special case of zero */
	if (!len)
		return;

	/* Word 0 of the product */
	x = (BNWORD128)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
	BIGLITTLE(*--prod, *prod++) = (BNWORD64)x;
	x >>= 64;

	/* Words 1 through len-1 */
	for (i = 1; i < len; i++) {
		carry = 0;
		p1 = num1;
		p2 = BIGLITTLE(num2-i-1,num2+i+1);
		for (j = 0; j <= i; j++) {
			BIG(y = (BNWORD128)*--p1 * *p2++;)
			LITTLE(y = (BNWORD128)*p1++ * *--p2;)
			x += y;
			carry += (x < y);
		}
		BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
		x = (x >> 64) | (BNWORD128)carry << 64;
	}
	/* Words len through 2*len-2 */
	for (i = 1; i < len; i++) {
		carry = 0;
		p1 = BIGLITTLE(num1-i,num1+i);
		p2 = BIGLITTLE(num2-len,num2+len);
		for (j = i; j < len; j++) {
			BIG(y = (BNWORD128)*--p1 * *p2++;)
			LITTLE(y = (BNWORD128)*p1++ * *--p2;)
			x += y;
			carry += (x < y);
		}
		BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
		x = (x >> 64) | (BNWORD128)carry << 64;
	}
	/* Word 2*len-1 */
	BIGLITTLE(*--prod,*prod) = (BNWORD64)x;
}
/* Suppress later definition */
#define lbnMulX_64 lbnMulX_64
#else /* !defined(BNWORD128) || !PRODUCT_SCAN */
/* Default trivial macro definition */
#define lbnMulX_64(prod, num1, num2, len) lbnMul_64(prod, num1, len, num2, len)
#endif /* !defined(BNWORD128) || !PRODUCT_SCAN */
#endif /* !lbnMulX_64 */
#if !defined(lbnMontMul_64) && defined(BNWORD128) && PRODUCT_SCAN
/*
 * Test code for product-scanning multiply.  This seems to slow the C
 * code down rather than speed it up.
 * This does a multiply and Montgomery reduction together, using the
 * same loops.  The outer loop scans across the product, twice.
 * The first pass computes the low half of the product and the
 * Montgomery multipliers.  These are stored in the product array,
 * which contains no data as of yet.  x and carry add up the columns
 * and propagate carries forward.
 *
 * The second half multiplies the upper half, adding in the modulus
 * times the Montgomery multipliers.  The results of this multiply
 * are stored.
 */
void
lbnMontMul_64(BNWORD64 *prod, BNWORD64 const *num1, BNWORD64 const *num2,
	BNWORD64 const *mod, unsigned len, BNWORD64 inv)
{
	BNWORD128 x, y;
	BNWORD64 const *p1, *p2, *pm;
	BNWORD64 *pp;
	BNWORD64 t;
	BNWORD64 carry;
	unsigned i, j;

	/* Special case of zero */
	if (!len)
		return;

	/*
	 * This computes directly into the high half of prod, so just
	 * shift the pointer and consider prod only "len" elements long
	 * for the rest of the code.
	 */
	BIGLITTLE(prod -= len, prod += len);

	/* Pass 1 - compute Montgomery multipliers */
	/* First iteration can have certain simplifications. */
	x = (BNWORD128)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
	BIGLITTLE(prod[-1], prod[0]) = t = inv * (BNWORD64)x;
	y = (BNWORD128)t * BIGLITTLE(mod[-1],mod[0]);
	x += y;
	/* Note: GCC 2.6.3 has a bug if you try to eliminate "carry" */
	carry = (x < y);
	assert((BNWORD64)x == 0);
	x = x >> 64 | (BNWORD128)carry << 64;

	for (i = 1; i < len; i++) {
		carry = 0;
		p1 = num1;
		p2 = BIGLITTLE(num2-i-1,num2+i+1);
		pp = prod;
		pm = BIGLITTLE(mod-i-1,mod+i+1);
		for (j = 0; j < i; j++) {
			y = (BNWORD128)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
			x += y;
			carry += (x < y);
			y = (BNWORD128)BIGLITTLE(*--pp * *pm++, *pp++ * *--pm);
			x += y;
			carry += (x < y);
		}
		y = (BNWORD128)BIGLITTLE(p1[-1] * p2[0], p1[0] * p2[-1]);
		x += y;
		carry += (x < y);
		assert(BIGLITTLE(pp == prod-i, pp == prod+i));
		BIGLITTLE(pp[-1], pp[0]) = t = inv * (BNWORD64)x;
		assert(BIGLITTLE(pm == mod-1, pm == mod+1));
		y = (BNWORD128)t * BIGLITTLE(pm[0],pm[-1]);
		x += y;
		carry += (x < y);
		assert((BNWORD64)x == 0);
		x = x >> 64 | (BNWORD128)carry << 64;
	}

	/* Pass 2 - compute reduced product and store */
	for (i = 1; i < len; i++) {
		carry = 0;
		p1 = BIGLITTLE(num1-i,num1+i);
		p2 = BIGLITTLE(num2-len,num2+len);
		pm = BIGLITTLE(mod-i,mod+i);
		pp = BIGLITTLE(prod-len,prod+len);
		for (j = i; j < len; j++) {
			y = (BNWORD128)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
			x += y;
			carry += (x < y);
			y = (BNWORD128)BIGLITTLE(*--pm * *pp++, *pm++ * *--pp);
			x += y;
			carry += (x < y);
		}
		assert(BIGLITTLE(pm == mod-len, pm == mod+len));
		assert(BIGLITTLE(pp == prod-i, pp == prod+i));
		BIGLITTLE(pp[0],pp[-1]) = (BNWORD64)x;
		x = (x >> 64) | (BNWORD128)carry << 64;
	}

	/* Last round of second half, simplified. */
	BIGLITTLE(*(prod-len),*(prod+len-1)) = (BNWORD64)x;
	carry = (BNWORD64)(x >> 64);

	while (carry)
		carry -= lbnSubN_64(prod, mod, len);
	while (lbnCmp_64(prod, mod, len) >= 0)
		(void)lbnSubN_64(prod, mod, len);
}
/* Suppress later definition */
#define lbnMontMul_64 lbnMontMul_64
#endif /* !lbnMontMul_64 && BNWORD128 && PRODUCT_SCAN */
#if !defined(lbnSquare_64) && defined(BNWORD128) && PRODUCT_SCAN
/*
 * Trial code for product-scanning squaring.  This seems to slow the C
 * code down rather than speed it up.
 */
void
lbnSquare_64(BNWORD64 *prod, BNWORD64 const *num, unsigned len)
{
	BNWORD128 x, y, z;
	BNWORD64 const *p1, *p2;
	BNWORD64 carry;
	unsigned i, j;

	/* Special case of zero */
	if (!len)
		return;

	/* Word 0 of product */
	x = (BNWORD128)BIGLITTLE(num[-1] * num[-1], num[0] * num[0]);
	BIGLITTLE(*--prod, *prod++) = (BNWORD64)x;
	x >>= 64;

	/* Words 1 through len-1 */
	for (i = 1; i < len; i++) {
		y = 0;
		carry = 0;
		p1 = num;
		p2 = BIGLITTLE(num-i-1,num+i+1);
		for (j = 0; j < (i+1)/2; j++) {
			BIG(z = (BNWORD128)*--p1 * *p2++;)
			LITTLE(z = (BNWORD128)*p1++ * *--p2;)
			y += z;
			carry += (y < z);
		}
		y += z = y;	/* Double y, remembering the old value in z */
		carry += carry + (y < z);
		if ((i & 1) == 0) {
			assert(BIGLITTLE(--p1 == p2, p1 == --p2));
			BIG(z = (BNWORD128)*p2 * *p2;)
			LITTLE(z = (BNWORD128)*p1 * *p1;)
			y += z;
			carry += (y < z);
		}
		x += y;
		carry += (x < y);
		BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
		x = (x >> 64) | (BNWORD128)carry << 64;
	}
	/* Words len through 2*len-2 */
	for (i = 1; i < len; i++) {
		y = 0;
		carry = 0;
		p1 = BIGLITTLE(num-i,num+i);
		p2 = BIGLITTLE(num-len,num+len);
		for (j = 0; j < (len-i)/2; j++) {
			BIG(z = (BNWORD128)*--p1 * *p2++;)
			LITTLE(z = (BNWORD128)*p1++ * *--p2;)
			y += z;
			carry += (y < z);
		}
		y += z = y;	/* Double y, remembering the old value in z */
		carry += carry + (y < z);
		if ((len-i) & 1) {
			assert(BIGLITTLE(--p1 == p2, p1 == --p2));
			BIG(z = (BNWORD128)*p2 * *p2;)
			LITTLE(z = (BNWORD128)*p1 * *p1;)
			y += z;
			carry += (y < z);
		}
		x += y;
		carry += (x < y);
		BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
		x = (x >> 64) | (BNWORD128)carry << 64;
	}

	/* Word 2*len-1 */
	BIGLITTLE(*--prod,*prod) = (BNWORD64)x;
}
/* Suppress later definition */
#define lbnSquare_64 lbnSquare_64
#endif /* !lbnSquare_64 && BNWORD128 && PRODUCT_SCAN */
/*
 * Square a number, using optimized squaring to reduce the number of
 * primitive multiplies that are executed.  There may not be any
 * overlap of the input and output.
 *
 * Technique: Consider the partial products in the multiplication
 * of "abcde" by itself:
 *
 *               a  b  c  d  e
 *            *  a  b  c  d  e
 *          ==================
 *              ae be ce de ee
 *           ad bd cd dd de
 *        ac bc cc cd ce
 *     ab bb bc bd be
 *  aa ab ac ad ae
 *
 * Note that everything above the main diagonal:
 *  ae be ce de = (abcd) * e
 *  ad bd cd    = (abc) * d
 *  ac bc       = (ab) * c
 *  ab          = (a) * b
 *
 * is a copy of everything below the main diagonal:
 *        de
 *     cd ce
 *  bc bd be
 *  ab ac ad ae
 *
 * Thus, the sum is 2 * (off the diagonal) + diagonal.
 *
 * This is accumulated beginning with the diagonal (which
 * consists of the squares of the digits of the input), which is then
 * divided by two, the off-diagonal added, and multiplied by two
 * again.  The low bit is simply a copy of the low bit of the
 * input, so it doesn't need special care.
 *
 * TODO: Merge the shift by 1 with the squaring loop.
 * TODO: Use Karatsuba.  (a*W+b)^2 = a^2 * (W^2+W) + b^2 * (W+1) - (a-b)^2 * W.
 */
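/*
 * Worked example (added for illustration), in base W = 10 for
 * readability: squaring the two-digit number 34, so a = 3, b = 4.
 * The diagonal is the squares aa = 9 and bb = 16, contributing
 * 9*100 + 16 = 916.  The off-diagonal is the single product ab = 12,
 * contributing 12*10 = 120, which is counted twice: 240.
 * Sum: 916 + 2*120 = 1156 = 34^2, i.e. diagonal + 2 * off-diagonal,
 * exactly as described above.
 */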
#ifndef lbnSquare_64
void
lbnSquare_64(BNWORD64 *prod, BNWORD64 const *num, unsigned len)
{
	BNWORD64 t;
	BNWORD64 *prodx = prod;		/* Working copy of the argument */
	BNWORD64 const *numx = num;	/* Working copy of the argument */
	unsigned lenx = len;		/* Working copy of the argument */

	if (!len)
		return;

	/* First, store all the squares */
	while (lenx--) {
#ifdef mul64_ppmm
		BNWORD64 ph, pl;
		t = BIGLITTLE(*--numx,*numx++);
		mul64_ppmm(ph,pl,t,t);
		BIGLITTLE(*--prodx,*prodx++) = pl;
		BIGLITTLE(*--prodx,*prodx++) = ph;
#elif defined(BNWORD128) /* use BNWORD128 */
		BNWORD128 p;
		t = BIGLITTLE(*--numx,*numx++);
		p = (BNWORD128)t * t;
		BIGLITTLE(*--prodx,*prodx++) = (BNWORD64)p;
		BIGLITTLE(*--prodx,*prodx++) = (BNWORD64)(p>>64);
#else	/* Use lbnMulN1_64 */
		t = BIGLITTLE(numx[-1],*numx);
		lbnMulN1_64(prodx, numx, 1, t);
		BIGLITTLE(--numx,numx++);
		BIGLITTLE(prodx -= 2, prodx += 2);
#endif
	}
	/* Then, shift right 1 bit */
	(void)lbnRshift_64(prod, 2*len, 1);

	/* Then, add in the off-diagonal sums */
	lenx = len;
	numx = num;
	prodx = prod;
	while (--lenx) {
		t = BIGLITTLE(*--numx,*numx++);
		BIGLITTLE(--prodx,prodx++);
		t = lbnMulAdd1_64(prodx, numx, lenx, t);
		lbnAdd1_64(BIGLITTLE(prodx-lenx,prodx+lenx), lenx+1, t);
		BIGLITTLE(--prodx,prodx++);
	}

	/* Shift it back up */
	lbnDouble_64(prod, 2*len);

	/* And set the low bit appropriately */
	BIGLITTLE(prod[-1],prod[0]) |= BIGLITTLE(num[-1],num[0]) & 1;
}
#endif /* !lbnSquare_64 */
/*
 * lbnNorm_64 - given a number, return a modified length such that the
 * most significant digit is non-zero.  Zero-length input is okay.
 */
#ifndef lbnNorm_64
unsigned
lbnNorm_64(BNWORD64 const *num, unsigned len)
{
	BIGLITTLE(num -= len,num += len);
	while (len && BIGLITTLE(*num++,*--num) == 0)
		--len;
	return len;
}
#endif /* lbnNorm_64 */
/*
 * lbnBits_64 - return the number of significant bits in the array.
 * It starts by normalizing the array.  Zero-length input is okay.
 * Then assuming there's anything to it, it fetches the high word,
 * generates a bit length by multiplying the word length by 64, and
 * subtracts off 64/2, 64/4, 64/8, ... bits if the high bits are clear.
 */
#ifndef lbnBits_64
unsigned
lbnBits_64(BNWORD64 const *num, unsigned len)
{
	BNWORD64 t;
	unsigned i;

	len = lbnNorm_64(num, len);
	if (len) {
		t = BIGLITTLE(*(num-len),*(num+(len-1)));
		assert(t);
		len *= 64;
		i = 64/2;
		do {
			if (t >> i)
				t >>= i;
			else
				len -= i;
		} while ((i /= 2) != 0);
	}
	return len;
}
#endif /* lbnBits_64 */
/*
 * If defined, use hand-rolled divide rather than compiler's native.
 * If the machine doesn't do it in line, the manual code is probably
 * faster, since it can assume normalization and the fact that the
 * quotient will fit into 64 bits, which a general 128-bit divide
 * in a compiler's run-time library can't do.
 */
#ifndef BN_SLOW_DIVIDE_128
/* Assume that divisors of more than thirty-two bits are slow */
#define BN_SLOW_DIVIDE_128	(128 > 0x20)
#endif
/*
 * Return (nh<<64|nl) % d, and place the quotient digit into *q.
 * It is guaranteed that nh < d, and that d is normalized (with its high
 * bit set).  If we have a double-width type, it's easy.  If not, ooh,
 * yuk!
 */
#ifndef lbnDiv21_64
#if defined(BNWORD128) && !BN_SLOW_DIVIDE_128
BNWORD64
lbnDiv21_64(BNWORD64 *q, BNWORD64 nh, BNWORD64 nl, BNWORD64 d)
{
	BNWORD128 n = (BNWORD128)nh << 64 | nl;

	/* Divisor must be normalized */
	assert(d >> (64-1) == 1);

	*q = n / d;
	return n % d;
}
#else
/*
 * This is where it gets ugly.
 *
 * Do the division in two halves, using Algorithm D from section 4.3.1
 * of Knuth.  Note Theorem B from that section, that the quotient estimate
 * is never less than the true quotient, and is never more than two
 * too high.
 *
 * The mapping onto conventional long division is (everything a half word):
 *        _____________qh___ql_
 * dh dl ) nh.h nh.l nl.h nl.l
 *
 * The implicit 3/2-digit d*qh and d*ql subtractors are computed this way:
 * First, estimate a q digit so that nh/dh works.  Subtracting qh*dh from
 * the (nh.h nh.l) list leaves a 1/2-word remainder r.  Then compute the
 * low part of the subtractor, qh * dl.  This also needs to be subtracted
 * from (nh.h nh.l nl.h) to get the final remainder.  So we take the
 * remainder, which is (nh.h nh.l) - qh*dh, shift it and add in nl.h, and
 * try to subtract qh * dl from that.  Since the remainder is 1/2-word
 * long, shifting and adding nl.h results in a single word r.
 * It is possible that the remainder we're working with, r, is less than
 * the product qh * dl, if we estimated qh too high.  The estimation
 * technique can produce a qh that is too large (never too small), leading
 * to r which is too small.  In that case, decrement the digit qh, add
 * shifted dh to r (to correct for that error), and subtract dl from the
 * product we're comparing r with.  That's the "correct" way to do it, but
 * just adding dl to r instead of subtracting it from the product is
 * equivalent and a lot simpler.  You just have to watch out for overflow.
 *
 * The process is repeated with (rrrr rrrr nl.l) for the low digit of the
 * quotient ql.
 *
 * The various uses of 64/2 for shifts are because of the note about
 * automatic editing of this file at the very top of the file.
 */
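/*
 * Worked example (added for illustration), in decimal with half-words of
 * one digit each, so a "word" is two digits and the divisor d = 87 is
 * normalized (its high digit is >= 5).  Divide n = 5279 (nh = 52,
 * nl = 79) by d = 87 (dh = 8, dl = 7):
 *   First half:  qh = 52/8 = 6, r = 52%8 = 4, prod = qh*dl = 42.
 *                r = 4*10 + highhalf(79) = 47 >= prod, so no correction;
 *                r -= prod leaves r = 5.
 *   Second half: ql = 5/8 = 0, r = 5%8 = 5, prod = ql*dl = 0.
 *                r = 5*10 + lowhalf(79) = 59 >= prod; r -= prod leaves 59.
 * Result: q = qh*10 + ql = 60, remainder 59.  Check: 60*87 + 59 = 5279.
 */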
#define highhalf(x) ( (x) >> 64/2 )
#define lowhalf(x) ( (x) & (((BNWORD64)1 << 64/2)-1) )
BNWORD64
lbnDiv21_64(BNWORD64 *q, BNWORD64 nh, BNWORD64 nl, BNWORD64 d)
{
	BNWORD64 dh = highhalf(d), dl = lowhalf(d);
	BNWORD64 qh, ql, prod, r;

	/* Divisor must be normalized */
	assert((d >> (64-1)) == 1);

	/* Do first half-word of division */
	qh = nh / dh;
	r = nh % dh;
	prod = qh * dl;

	/*
	 * Add next half-word of numerator to remainder and correct.
	 * qh may be up to two too large.
	 */
	r = (r << (64/2)) | highhalf(nl);
	if (r < prod) {
		--qh; r += d;
		if (r >= d && r < prod) {
			--qh; r += d;
		}
	}
	r -= prod;

	/* Do second half-word of division */
	ql = r / dh;
	r = r % dh;
	prod = ql * dl;

	r = (r << (64/2)) | lowhalf(nl);
	if (r < prod) {
		--ql; r += d;
		if (r >= d && r < prod) {
			--ql; r += d;
		}
	}
	r -= prod;

	*q = (qh << (64/2)) | ql;
	return r;
}
#endif /* BNWORD128 && !BN_SLOW_DIVIDE_128 */
#endif /* lbnDiv21_64 */
/*
 * In the division functions, the dividend and divisor are referred to
 * as "n" and "d", which stand for "numerator" and "denominator".
 *
 * The quotient is (nlen-dlen+1) digits long.  It may be overlapped with
 * the high (nlen-dlen) words of the dividend, but one extra word is needed
 * on top to hold the top word.
 */
/*
 * Divide an n-word number by a 1-word number, storing the remainder
 * and n-1 words of the n-word quotient.  The high word is returned.
 * It IS legal for rem to point to the same address as n, and for
 * q to point one word higher.
 *
 * TODO: If BN_SLOW_DIVIDE_128, add a divnhalf_64 which uses 64-bit
 *       dividends if the divisor is half that long.
 * TODO: Shift the dividend on the fly to avoid the last division and
 *       instead have a remainder that needs shifting.
 * TODO: Use reciprocals rather than dividing.
 */
#ifndef lbnDiv1_64
BNWORD64
lbnDiv1_64(BNWORD64 *q, BNWORD64 *rem, BNWORD64 const *n, unsigned len,
	BNWORD64 d)
{
	unsigned shift;
	unsigned xlen;
	BNWORD64 r;
	BNWORD64 qhigh;

	assert(len > 0);
	assert(d);

	if (len == 1) {
		r = *n;
		*rem = r % d;
		return r / d;
	}

	/* Normalize the divisor, remembering the shift */
	shift = 0;
	xlen = 64/2;
	do {
		if ((d >> (64-shift-xlen)) == 0)
			shift += xlen;
	} while ((xlen /= 2) != 0);
	assert((d >> (64-1-shift)) == 1);
	d <<= shift;

	BIGLITTLE(q -= len-1,q += len-1);
	BIGLITTLE(n -= len,n += len);

	r = BIGLITTLE(*n++,*--n);
	if (r < d) {
		qhigh = 0;
	} else {
		qhigh = r / d;
		r %= d;
	}

	xlen = len;
	while (--xlen)
		r = lbnDiv21_64(BIGLITTLE(q++,--q), r, BIGLITTLE(*n++,*--n), d);

	/*
	 * Final correction for shift - shift the quotient up "shift"
	 * bits, and merge in the extra bits of quotient.  Then reduce
	 * the final remainder mod the real d.
	 */
	if (shift) {
		d >>= shift;
		qhigh = (qhigh << shift) | lbnLshift_64(q, len-1, shift);
		BIGLITTLE(q[-1],*q) |= r / d;
		r %= d;
	}
	*rem = r;

	return qhigh;
}
#endif /* !lbnDiv1_64 */
/*
 * This function performs a "quick" modulus of a number with a divisor
 * d which is guaranteed to be at most sixteen bits, i.e. less than 65536.
 * This applies regardless of the word size the library is compiled with.
 *
 * This function is important to prime generation, for sieving.
 */
#ifndef lbnModQ_64
/* If there's a custom lbnMod21_64, no normalization needed */
#ifdef lbnMod21_64
unsigned
lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
{
	BNWORD64 r;

	assert(len > 0);

	BIGLITTLE(n -= len,n += len);

	/* Try using a compare to avoid the first divide */
	r = BIGLITTLE(*n++,*--n);
	if (r >= d)
		r %= d;
	while (--len)
		r = lbnMod21_64(r, BIGLITTLE(*n++,*--n), d);

	return (unsigned)r;
}
#elif defined(BNWORD128) && !BN_SLOW_DIVIDE_128
unsigned
lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
{
	BNWORD64 r;

	if (!--len)
		return BIGLITTLE(n[-1],n[0]) % d;

	BIGLITTLE(n -= len,n += len);
	r = BIGLITTLE(n[-1],n[0]);

	do {
		r = (BNWORD64)((((BNWORD128)r<<64) | BIGLITTLE(*n++,*--n)) % d);
	} while (--len);

	return (unsigned)r;
}
#elif 64 >= 0x20
/*
 * If the single word size can hold 65535*65536, then this function
 * is available.
 */
#define highhalf(x) ( (x) >> 64/2 )
#define lowhalf(x) ( (x) & ((1 << 64/2)-1) )
unsigned
lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
{
	BNWORD64 r, x;

	BIGLITTLE(n -= len,n += len);

	r = BIGLITTLE(*n++,*--n);
	while (--len) {
		x = BIGLITTLE(*n++,*--n);
		r = (r%d << 64/2) | highhalf(x);
		r = (r%d << 64/2) | lowhalf(x);
	}

	return (unsigned)(r % d);
}
#else
/* Default case - use lbnDiv21_64 */
unsigned
lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
{
	unsigned shift;
	unsigned xlen;
	BNWORD64 r;
	BNWORD64 q;
	BNWORD64 dw = d;	/* Divisor, widened to a full word */

	assert(len > 0);

	/* Normalize the divisor, remembering the shift */
	shift = 0;
	xlen = 64/2;
	do {
		if ((dw >> (64-shift-xlen)) == 0)
			shift += xlen;
	} while ((xlen /= 2) != 0);
	assert(dw >> (64-1-shift) == 1);
	dw <<= shift;

	BIGLITTLE(n -= len,n += len);

	r = BIGLITTLE(*n++,*--n);
	if (r >= dw)
		r %= dw;
	while (--len)
		r = lbnDiv21_64(&q, r, BIGLITTLE(*n++,*--n), dw);

	/*
	 * Final correction for shift - shift the quotient up "shift"
	 * bits, and merge in the extra bits of quotient.  Then reduce
	 * the final remainder mod the real d.
	 */
	return (unsigned)(r % d);
}
#endif
#endif /* lbnModQ_64 */
/*
 * Reduce n mod d and return the quotient.  That is, find:
 * q = n / d;
 * n = n % d;
 *
 * d is altered during the execution of this subroutine by normalizing it.
 * It must already have its most significant word non-zero; it is shifted
 * so its most significant bit is non-zero.
 *
 * The quotient q is nlen-dlen+1 words long.  To make it possible to
 * overlap the quotient with the input (you can store it in the high dlen
 * words), the high word of the quotient is *not* stored, but is returned.
 * (If all you want is the remainder, you don't care about it, anyway.)
 *
 * This uses algorithm D from Knuth (4.3.1), except that we do binary
 * (shift) normalization of the divisor.  WARNING: This is hairy!
 *
 * This function is used for some modular reduction, but it is not used in
 * the modular exponentiation loops; they use Montgomery form and the
 * corresponding, more efficient, Montgomery reduction.  This code
 * is needed for the conversion to Montgomery form, however, so it
 * has to be here and it might as well be reasonably efficient.
 *
 * The overall operation is as follows ("top" and "up" refer to the
 * most significant end of the number; "bottom" and "down", the least):
 *
 * - Shift the divisor up until the most significant bit is set.
 * - Shift the dividend up the same amount.  This will produce the
 *   correct quotient, and the remainder can be recovered by shifting
 *   it back down the same number of bits.  This may produce an overflow
 *   word, but the word is always strictly less than the most significant
 *   word of the divisor.
 * - Estimate the first quotient digit qhat:
 *   - First take the top two words (one of which is the overflow) of the
 *     dividend and divide by the top word of the divisor:
 *     qhat = (nh,nm)/dh.  This qhat is >= the correct quotient digit
 *     and, since dh is normalized, it is at most two over.
 *   - Second, correct by comparing the top three words.  If
 *     (dh,dl) * qhat > (nh,nm,nl), decrease qhat and try again.
 *     The second iteration can be simpler because there can't be a third.
 *     The computation can be simplified by subtracting dh*qhat from
 *     both sides, suitably shifted.  This reduces the left side to
 *     dl*qhat.  On the right, (nh,nm)-dh*qhat is simply the
 *     remainder r from (nh,nm)%dh, so the right is (r,nl).
 *     This produces qhat that is almost always correct and at
 *     most (prob ~ 2/2^64) one too high.
 * - Subtract qhat times the divisor (suitably shifted) from the dividend.
 *   If there is a borrow, qhat was wrong, so decrement it
 *   and add the divisor back in (once).
 * - Store the final quotient digit qhat in the quotient array q.
 *
 * Repeat the quotient digit computation for successive digits of the
 * quotient until the whole quotient has been computed.  Then shift the
 * divisor and the remainder down to correct for the normalization.
 *
 * (A small worked example of the qhat estimate and correction follows
 * this comment.)
 *
 * TODO: Special case 2-word divisors.
 * TODO: Use reciprocals rather than dividing.
 */
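/*
 * Worked example of the qhat estimate (added for illustration), in
 * decimal: divide by the normalized divisor d = 59, so dh = 5, dl = 9,
 * with the top three dividend digits nh = 4, nm = 9, nl = 0 (i.e. the
 * current window is 490).
 *   Estimate: qhat = (nh,nm)/dh = 49/5 = 9, with remainder r = 4.
 *   Check:    dl*qhat = 81 against (r,nl) = 40; 81 > 40, so qhat is
 *             too big.  Decrement: qhat = 8, r += dh gives r = 9.
 *   Recheck:  dl*qhat = 72 against (r,nl) = 90; 72 <= 90, so stop.
 * Indeed 490/59 = 8 remainder 18, so qhat = 8 is the true digit.
 */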
#ifndef lbnDiv_64
BNWORD64
lbnDiv_64(BNWORD64 *q, BNWORD64 *n, unsigned nlen, BNWORD64 *d, unsigned dlen)
{
	BNWORD64 nh,nm,nl;	/* Top three words of the dividend */
	BNWORD64 dh,dl;		/* Top two words of the divisor */
	BNWORD64 qhat;		/* Estimate of quotient word */
	BNWORD64 r;		/* Remainder from quotient estimate division */
	BNWORD64 qhigh;		/* High word of quotient */
	unsigned i;		/* Temp */
	unsigned shift;		/* Bits shifted by normalization */
	unsigned qlen = nlen-dlen;	/* Size of quotient (less 1) */
#ifdef mul64_ppmm
	BNWORD64 t64;
#elif defined(BNWORD128)
	BNWORD128 t128;
#else /* use lbnMulN1_64 */
	BNWORD64 t2[2];
#define t2high BIGLITTLE(t2[0],t2[1])
#define t2low BIGLITTLE(t2[1],t2[0])
#endif

	assert(dlen);
	assert(nlen >= dlen);

	/*
	 * Special cases for short divisors.  The general case uses the
	 * top 2 digits of the divisor (d) to estimate a quotient digit,
	 * so it breaks if there are fewer digits available.  Thus, we need
	 * special cases for a divisor of length 1.  A divisor of length
	 * 2 can have a *lot* of administrivia overhead removed, so it's
	 * probably worth special-casing that case, too.
	 */
	if (dlen == 1)
		return lbnDiv1_64(q, BIGLITTLE(n-1,n), n, nlen,
		                  BIGLITTLE(d[-1],d[0]));

#if 0
	/*
	 * @@@ This is not yet written...  The general loop will do,
	 * albeit less efficiently.
	 */
	if (dlen == 2) {
		/*
		 * divisor two digits long:
		 * use the 3/2 technique from Knuth, but we know
		 * it's normalized.
		 */
		dh = BIGLITTLE(d[-1],d[0]);
		dl = BIGLITTLE(d[-2],d[1]);

		shift = 0;
		if ((dh & ((BNWORD64)1 << (64-1-shift))) == 0) {
			do {
				shift++;
			} while ((dh & ((BNWORD64)1 << (64-1-shift))) == 0);
			dh = dh << shift | dl >> (64-shift);
		}

		/* Alternative normalization loop: */
		for (shift = 0; (dh & ((BNWORD64)1 << (64-1-shift))) == 0; shift++)
			;
		dh = dh << shift | dl >> (64-shift);
	}
#endif

	dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
	assert(dh);

	/* Normalize the divisor */
	shift = 0;
	i = 64/2;
	do {
		if ((dh >> (64-shift-i)) == 0)
			shift += i;
	} while ((i /= 2) != 0);

	nh = 0;
	if (shift) {
		lbnLshift_64(d, dlen, shift);
		dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
		nh = lbnLshift_64(n, nlen, shift);
	}

	/* Assert that dh is now normalized */
	assert(dh >> (64-1));

	/* Also get the second-most significant word of the divisor */
	dl = BIGLITTLE(*(d-(dlen-1)),*(d+(dlen-2)));

	/*
	 * Adjust pointers: n to point to the least significant end of the
	 * first subtract, and q to one past the most-significant end of the
	 * quotient array.
	 */
	BIGLITTLE(n -= qlen,n += qlen);
	BIGLITTLE(q -= qlen,q += qlen);

	/* Fetch the most significant stored word of the dividend */
	nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));

	/*
	 * Compute the first digit of the quotient, based on the
	 * first two words of the dividend (the most significant of which
	 * is the overflow word h).
	 */
	if (nh) {
		assert(nh < dh);
		r = lbnDiv21_64(&qhat, nh, nm, dh);
	} else if (nm >= dh) {
		qhat = nm/dh;
		r = nm % dh;
	} else {	/* Quotient is zero */
		qhigh = 0;
		goto divloop;
	}

	/* Now get the third most significant word of the dividend */
	nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));

	/*
	 * Correct qhat, the estimate of quotient digit.
	 * qhat can only be high, and at most two words high,
	 * so the loop can be unrolled and abbreviated.
	 */
#ifdef mul64_ppmm
	mul64_ppmm(nm, t64, qhat, dl);
	if (nm > r || (nm == r && t64 > nl)) {
		/* Decrement qhat and adjust comparison parameters */
		qhat--;
		if ((r += dh) >= dh) {
			nm -= (t64 < dl);
			t64 -= dl;
			if (nm > r || (nm == r && t64 > nl))
				qhat--;
		}
	}
#elif defined(BNWORD128)
	t128 = (BNWORD128)qhat * dl;
	if (t128 > ((BNWORD128)r << 64) + nl) {
		/* Decrement qhat and adjust comparison parameters */
		qhat--;
		if ((r += dh) >= dh) {
			t128 -= dl;
			if (t128 > ((BNWORD128)r << 64) + nl)
				qhat--;
		}
	}
#else /* Use lbnMulN1_64 */
	lbnMulN1_64(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
	if (t2high > r || (t2high == r && t2low > nl)) {
		/* Decrement qhat and adjust comparison parameters */
		qhat--;
		if ((r += dh) >= dh) {
			t2high -= (t2low < dl);
			t2low -= dl;
			if (t2high > r || (t2high == r && t2low > nl))
				qhat--;
		}
	}
#endif

	/* Do the multiply and subtract */
	r = lbnMulSub1_64(n, d, dlen, qhat);
	/* If there was a borrow, add back once. */
	if (r > nh) {	/* Borrow? */
		(void)lbnAddN_64(n, d, dlen);
		qhat--;
	}

	/* Remember the first quotient digit. */
	qhigh = qhat;

	/* Now, the main division loop: */
divloop:
	while (qlen--) {

		/* Advance n */
		nh = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
		BIGLITTLE(++n,--n);
		nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));

		if (nh == dh) {
			qhat = ~(BNWORD64)0;
			/* Optimized computation of r = (nh,nm) - qhat * dh */
			r = nm + dh;
			if (r < dh)
				goto subtract;
		} else {
			assert(nh < dh);
			r = lbnDiv21_64(&qhat, nh, nm, dh);
		}

		nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
#ifdef mul64_ppmm
		mul64_ppmm(nm, t64, qhat, dl);
		if (nm > r || (nm == r && t64 > nl)) {
			/* Decrement qhat and adjust comparison parameters */
			qhat--;
			if ((r += dh) >= dh) {
				nm -= (t64 < dl);
				t64 -= dl;
				if (nm > r || (nm == r && t64 > nl))
					qhat--;
			}
		}
#elif defined(BNWORD128)
		t128 = (BNWORD128)qhat * dl;
		if (t128 > ((BNWORD128)r<<64) + nl) {
			/* Decrement qhat and adjust comparison parameters */
			qhat--;
			if ((r += dh) >= dh) {
				t128 -= dl;
				if (t128 > ((BNWORD128)r << 64) + nl)
					qhat--;
			}
		}
#else /* Use lbnMulN1_64 */
		lbnMulN1_64(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
		if (t2high > r || (t2high == r && t2low > nl)) {
			/* Decrement qhat and adjust comparison parameters */
			qhat--;
			if ((r += dh) >= dh) {
				t2high -= (t2low < dl);
				t2low -= dl;
				if (t2high > r || (t2high == r && t2low > nl))
					qhat--;
			}
		}
#endif

		/*
		 * As a point of interest, note that it is not worth checking
		 * for qhat of 0 or 1 and installing special-case code.  These
		 * occur with probability 2^-64, so spending 1 cycle to check
		 * for them is only worth it if we save more than 2^15 cycles,
		 * and a multiply-and-subtract for numbers in the 1024-bit
		 * range just doesn't take that long.
		 */
subtract:
		/*
		 * n points to the least significant end of the substring
		 * of n to be subtracted from.  qhat is either exact or
		 * one too large.  If the subtract gets a borrow, it was
		 * one too large and the divisor is added back in.  It's
		 * a dlen+1 word add which is guaranteed to produce a
		 * carry out, so it can be done very simply.
		 */
		r = lbnMulSub1_64(n, d, dlen, qhat);
		if (r > nh) {	/* Borrow? */
			(void)lbnAddN_64(n, d, dlen);
			qhat--;
		}
		/* Store the quotient digit */
		BIGLITTLE(*q++,*--q) = qhat;
	}

	/* Undo the normalizing shift */
	if (shift) {
		lbnRshift_64(d, dlen, shift);
		lbnRshift_64(n, dlen, shift);
	}

	return qhigh;
}
#endif /* !lbnDiv_64 */
/*
 * Find the negative multiplicative inverse of x (x must be odd!) modulo 2^64.
 *
 * This just performs Newton's iteration until it gets the
 * inverse.  The initial estimate is always correct to 3 bits, and
 * sometimes 4.  The number of valid bits doubles each iteration.
 * (To prove it, assume x * y == 1 (mod 2^n), and introduce a variable
 * for the error mod 2^2n.  x * y == 1 + k*2^n (mod 2^2n) and follow
 * the iteration through.)
 */
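/*
 * Worked example (added for illustration), modulo 2^4 for brevity:
 * for x = 11, the initial estimate y = x = 11 gives
 * 11*11 = 121 == 9 (mod 16), so the loop runs: y *= 2 - 9 makes
 * y = 11 * -7 == 3 (mod 16), and 11*3 = 33 == 1 (mod 16), so it stops
 * and returns -y = 13.  Check: 11 * 13 = 143 == -1 (mod 16).
 */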
#ifndef lbnMontInv1_64
BNWORD64
lbnMontInv1_64(BNWORD64 const x)
{
	BNWORD64 y = x, z;

	assert(x & 1);

	while ((z = x*y) != 1)
		y *= 2 - z;
	return -y;
}
#endif /* !lbnMontInv1_64 */
#if defined(BNWORD128) && PRODUCT_SCAN
/*
 * Test code for product-scanning Montgomery reduction.
 * This seems to slow the C code down rather than speed it up.
 *
 * The first loop computes the Montgomery multipliers, storing them over
 * the low half of the number n.
 *
 * The second half multiplies the upper half, adding in the modulus
 * times the Montgomery multipliers.  The results of this multiply
 * are stored.
 */
void
lbnMontReduce_64(BNWORD64 *n, BNWORD64 const *mod, unsigned mlen, BNWORD64 inv)
{
	BNWORD128 x, y;
	BNWORD64 const *pm;
	BNWORD64 *pn;
	BNWORD64 t;
	BNWORD64 carry;
	unsigned i, j;

	/* Special case of zero */
	if (!mlen)
		return;

	/* Pass 1 - compute Montgomery multipliers */
	/* First iteration can have certain simplifications. */
	t = BIGLITTLE(n[-1],n[0]);
	x = t;
	t *= inv;
	BIGLITTLE(n[-1], n[0]) = t;
	x += (BNWORD128)t * BIGLITTLE(mod[-1],mod[0]); /* Can't overflow */
	assert((BNWORD64)x == 0);
	x = x >> 64;

	for (i = 1; i < mlen; i++) {
		carry = 0;
		pn = n;
		pm = BIGLITTLE(mod-i-1,mod+i+1);
		for (j = 0; j < i; j++) {
			y = (BNWORD128)BIGLITTLE(*--pn * *pm++, *pn++ * *--pm);
			x += y;
			carry += (x < y);
		}
		assert(BIGLITTLE(pn == n-i, pn == n+i));
		y = t = BIGLITTLE(pn[-1], pn[0]);
		x += y;
		carry += (x < y);
		BIGLITTLE(pn[-1], pn[0]) = t = inv * (BNWORD64)x;
		assert(BIGLITTLE(pm == mod-1, pm == mod+1));
		y = (BNWORD128)t * BIGLITTLE(pm[0],pm[-1]);
		x += y;
		carry += (x < y);
		assert((BNWORD64)x == 0);
		x = x >> 64 | (BNWORD128)carry << 64;
	}

	BIGLITTLE(n -= mlen, n += mlen);

	/* Pass 2 - compute upper words and add to n */
	for (i = 1; i < mlen; i++) {
		carry = 0;
		pm = BIGLITTLE(mod-i,mod+i);
		pn = n;
		for (j = i; j < mlen; j++) {
			y = (BNWORD128)BIGLITTLE(*--pm * *pn++, *pm++ * *--pn);
			x += y;
			carry += (x < y);
		}
		assert(BIGLITTLE(pm == mod-mlen, pm == mod+mlen));
		assert(BIGLITTLE(pn == n+mlen-i, pn == n-mlen+i));
		y = t = BIGLITTLE(*(n-i),*(n+i-1));
		x += y;
		carry += (x < y);
		BIGLITTLE(*(n-i),*(n+i-1)) = (BNWORD64)x;
		x = (x >> 64) | (BNWORD128)carry << 64;
	}

	/* Last round of second half, simplified. */
	t = BIGLITTLE(*(n-mlen),*(n+mlen-1));
	x += t;
	BIGLITTLE(*(n-mlen),*(n+mlen-1)) = (BNWORD64)x;
	carry = (unsigned)(x >> 64);

	while (carry)
		carry -= lbnSubN_64(n, mod, mlen);
	while (lbnCmp_64(n, mod, mlen) >= 0)
		(void)lbnSubN_64(n, mod, mlen);
}
/* Suppress later definition */
#define lbnMontReduce_64 lbnMontReduce_64
#endif /* BNWORD128 && PRODUCT_SCAN */
/*
 * Montgomery reduce n, modulo mod.  This reduces modulo mod and divides by
 * 2^(64*mlen).  Returns the result in the *top* mlen words of the argument n.
 * This is ready for another multiplication using lbnMul_64.
 *
 * Montgomery representation is a very useful way to encode numbers when
 * you're doing lots of modular reduction.  What you do is pick a multiplier
 * R which is relatively prime to the modulus and very easy to divide by.
 * Since the modulus is odd, R is chosen as a power of 2, so the division
 * is a shift.  In fact, it's a shift of an integral number of words,
 * so the shift can be implicit - just drop the low-order words.
 *
 * Now, choose R *larger* than the modulus m, 2^(64*mlen).  Then convert
 * all numbers a, b, etc. to Montgomery form M(a), M(b), etc using the
 * relationship M(a) = a*R mod m, M(b) = b*R mod m, etc.  Note that:
 * - The Montgomery form of a number depends on the modulus m.
 *   A fixed modulus m is assumed throughout this discussion.
 * - Since R is relatively prime to m, multiplication by R is invertible;
 *   no information about the numbers is lost, they're just scrambled.
 * - Adding (and subtracting) numbers in this form works just as usual.
 *   M(a+b) = (a+b)*R mod m = (a*R + b*R) mod m = (M(a) + M(b)) mod m
 * - Multiplying numbers in this form produces a*b*R*R.  The problem
 *   is to divide out the excess factor of R, modulo m as well as to
 *   reduce to the given length mlen.  It turns out that this can be
 *   done *faster* than a normal divide, which is where the speedup
 *   in Montgomery division comes from.
 *
 * Normal reduction chooses a most-significant quotient digit q and then
 * subtracts q*m from the number to be reduced.  Choosing q is tricky
 * and involved (just look at lbnDiv_64 to see!) and is usually
 * imperfect, requiring a check for correction after the subtraction.
 *
 * Montgomery reduction *adds* a multiple of m to the *low-order* part
 * of the number to be reduced.  This multiple is chosen to make the
 * low-order part of the number come out to zero.  This can be done
 * with no trickery or error using a precomputed inverse of the modulus.
 * In this code, the "part" is one word, but any width can be used.
 *
 * Repeating this step sufficiently often results in a value which
 * is a multiple of R (a power of two, remember) but is still (since
 * the additions were to the low-order part and thus did not increase
 * the value of the number being reduced very much) not much
 * larger than m*R.  Then implicitly divide by R and subtract off
 * m until the result is in the correct range.
 *
 * Since the low-order part being cancelled is less than R, the
 * multiple of m added must have a multiplier which is at most R-1.
 * Assuming that the input is at most m*R-1, the final number is
 * at most m*(2*R-1)-1 = 2*m*R - m - 1, so subtracting m once from
 * the high-order part, equivalent to subtracting m*R from the
 * whole number, produces a result which is at most m*R - m - 1,
 * which divided by R is at most m-1.
 *
 * To convert *to* Montgomery form, you need a regular remainder
 * routine, although you can just compute R*R (mod m) and do the
 * conversion using Montgomery multiplication.  To convert *from*
 * Montgomery form, just Montgomery reduce the number to
 * remove the extra factor of R.  (A sketch of the round trip using
 * the routines in this file follows this comment.)
 *
 * TODO: Change to a full inverse and use Karatsuba's multiplication
 * rather than this word-at-a-time.
 */
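/*
 * Illustrative sketch (added; not part of the original source): the
 * Montgomery round trip using the routines below.  For simplicity this
 * assumes a little-endian word layout, so least-significant-end pointers
 * are just array bases; buffer sizes follow the comments on the
 * individual routines.
 */
#if 0	/* example only, not compiled */
static void
example_mont_square(BNWORD64 *tmp,	/* 2*mlen words of scratch */
	BNWORD64 *a,			/* 2*mlen words, value in low mlen */
	BNWORD64 *mod, unsigned mlen)	/* modulus, mlen words, odd */
{
	/* inv = -1/mod (mod 2^64), from the modulus' low word */
	BNWORD64 inv = lbnMontInv1_64(mod[0]);

	lbnToMont_64(a, mlen, mod, mlen);	  /* a := M(a) = a*R mod m */
	lbnMontSquare_64(tmp, a, mod, mlen, inv); /* high half := M(a^2) */
	lbnCopy_64(a, tmp + mlen, mlen);	  /* pull the result down */
	lbnFromMont_64(a, mod, mlen);		  /* a := a^2 mod m */
}
#endif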
#ifndef lbnMontReduce_64
void
lbnMontReduce_64(BNWORD64 *n, BNWORD64 const *mod, unsigned const mlen,
	BNWORD64 inv)
{
	BNWORD64 t;
	BNWORD64 c = 0;
	unsigned len = mlen;

	/* inv must be the negative inverse of mod's least significant word */
	assert((BNWORD64)(inv * BIGLITTLE(mod[-1],mod[0])) == (BNWORD64)-1);

	assert(len);

	do {
		t = lbnMulAdd1_64(n, mod, mlen, inv * BIGLITTLE(n[-1],n[0]));
		c += lbnAdd1_64(BIGLITTLE(n-mlen,n+mlen), len, t);
		BIGLITTLE(--n,++n);
	} while (--len);

	/*
	 * All that adding can cause an overflow past the modulus size,
	 * but it's unusual, and never by much, so a subtraction loop
	 * is the right way to deal with it.
	 * This subtraction happens infrequently - I've only ever seen it
	 * invoked once per reduction, and then just under 22.5% of the time.
	 */
	while (c)
		c -= lbnSubN_64(n, mod, mlen);
	while (lbnCmp_64(n, mod, mlen) >= 0)
		(void)lbnSubN_64(n, mod, mlen);
}
#endif /* !lbnMontReduce_64 */
/*
 * A couple of helpers that you might want to implement atomically
 * in asm sometime.
 */
#ifndef lbnMontMul_64
/*
 * Multiply "num1" by "num2", modulo "mod", all of length "len", and
 * place the result in the high half of "prod".  "inv" is the inverse
 * of the least-significant word of the modulus, modulo 2^64.
 * This uses numbers in Montgomery form.  Reduce using "len" and "inv".
 *
 * This is implemented as a macro to win on compilers that don't do
 * inlining, since it's so trivial.
 */
#define lbnMontMul_64(prod, n1, n2, mod, len, inv) \
	(lbnMulX_64(prod, n1, n2, len), lbnMontReduce_64(prod, mod, len, inv))
#endif /* !lbnMontMul_64 */
#ifndef lbnMontSquare_64
/*
 * Square "n", modulo "mod", both of length "len", and place the result
 * in the high half of "prod".  "inv" is the inverse of the least-significant
 * word of the modulus, modulo 2^64.
 * This uses numbers in Montgomery form.  Reduce using "len" and "inv".
 *
 * This is implemented as a macro to win on compilers that don't do
 * inlining, since it's so trivial.
 */
#define lbnMontSquare_64(prod, n, mod, len, inv) \
	(lbnSquare_64(prod, n, len), lbnMontReduce_64(prod, mod, len, inv))
#endif /* !lbnMontSquare_64 */
/*
 * Convert a number to Montgomery form - requires mlen + nlen words
 * of memory in n.
 */
#ifndef lbnToMont_64
void
lbnToMont_64(BNWORD64 *n, unsigned nlen, BNWORD64 *mod, unsigned mlen)
{
	/* Move n up "mlen" words */
	lbnCopy_64(BIGLITTLE(n-mlen,n+mlen), n, nlen);
	lbnZero_64(n, mlen);
	/* Do the division - dump the quotient in the high-order words */
	(void)lbnDiv_64(BIGLITTLE(n-mlen,n+mlen), n, mlen+nlen, mod, mlen);
}
#endif /* !lbnToMont_64 */
/*
 * Convert from Montgomery form.  Montgomery reduction is all that is
 * needed.
 */
#ifndef lbnFromMont_64
void
lbnFromMont_64(BNWORD64 *n, BNWORD64 *mod, unsigned len)
{
	/* Zero the high words of n */
	lbnZero_64(BIGLITTLE(n-len,n+len), len);
	lbnMontReduce_64(n, mod, len, lbnMontInv1_64(mod[BIGLITTLE(-1,0)]));
	/* Move n down len words */
	lbnCopy_64(n, BIGLITTLE(n-len,n+len), len);
}
#endif /* !lbnFromMont_64 */
/*
 * The windowed exponentiation algorithm, precomputes a table of odd
 * powers of n up to 2^k. See the comment in bnExpMod_64 below for
 * an explanation of how it actually works.
 *
 * It takes 2^(k-1)-1 multiplies to compute the table, and (e-1)/(k+1)
 * multiplies (on average) to perform the exponentiation. To minimize
 * the sum, k must vary with e. The optimal window sizes vary with the
 * exponent length. Here are some selected values and the boundary cases.
 * (An underscore _ has been inserted into some of the numbers to ensure
 * that magic strings like 64 do not appear in this table. It should be
 * ignored.)
 *
 * At e =    1 bits, k=1   (0.000000) is best
 * At e =    2 bits, k=1   (0.500000) is best
 * At e =    4 bits, k=1   (1.500000) is best
 * At e =    8 bits, k=2   (3.333333) < k=1   (3.500000)
 * At e =  1_6 bits, k=2   (6.000000) is best
 * At e =   26 bits, k=3   (9.250000) < k=2   (9.333333)
 * At e =  3_2 bits, k=3  (10.750000) is best
 * At e =  6_4 bits, k=3  (18.750000) is best
 * At e =   82 bits, k=4  (23.200000) < k=3  (23.250000)
 * At e =  128 bits, k=4 (3_2.400000) is best
 * At e =  242 bits, k=5 (55.1_66667) < k=4  (55.200000)
 * At e =  256 bits, k=5  (57.500000) is best
 * At e =  512 bits, k=5 (100.1_66667) is best
 * At e =  674 bits, k=6 (127.142857) < k=5 (127.1_66667)
 * At e = 1024 bits, k=6 (177.142857) is best
 * At e = 1794 bits, k=7 (287.125000) < k=6 (287.142857)
 * At e = 2048 bits, k=7 (318.875000) is best
 * At e = 4096 bits, k=7 (574.875000) is best
 *
 * The numbers in parentheses are the expected number of multiplications
 * needed to do the computation. The normal russian-peasant modular
 * exponentiation technique always uses (e-1)/2. For exponents as
 * small as 192 bits (below the range of current factoring algorithms),
 * half of the multiplies are eliminated, 45.2 as opposed to the naive
 * 95.5. Counting the 191 squarings as 3/4 a multiply each (squaring
 * proper is just over half of multiplying, but the Montgomery
 * reduction in each case is also a multiply), that's 143.25
 * multiplies, for totals of 188.45 vs. 238.75 - a 21% savings.
 * For larger exponents (like 512 bits), it's 483.92 vs. 639.25, a
 * 24.3% savings. It asymptotically approaches 25%.
 *
 * Um, actually there's a slightly more accurate way to count, which
 * really is the average number of multiplies required, averaged
 * uniformly over all 2^(e-1) e-bit numbers, from 2^(e-1) to (2^e)-1.
 * It's based on the recurrence that for the last b bits, b <= k, at
 * most one multiply is needed (and none at all 1/2^b of the time),
 * while when b > k, the odds are 1/2 each way that the bit will be
 * 0 (meaning no multiplies to reduce it to the b-1-bit case) and
 * 1/2 that the bit will be 1, starting a k-bit window and requiring
 * 1 multiply beyond the b-k-bit case. Since the most significant
 * bit is always 1, a k-bit window always starts there, and that
 * multiply is by 1, so it isn't a multiply at all. Thus, the
 * number of multiplies is simply that needed for the last e-k bits.
 * This recurrence produces:
 *
 * At e =    1 bits, k=1   (0.000000) is best
 * At e =    2 bits, k=1   (0.500000) is best
 * At e =    4 bits, k=1   (1.500000) is best
 * At e =    6 bits, k=2   (2.437500) < k=1   (2.500000)
 * At e =    8 bits, k=2   (3.109375) is best
 * At e =  1_6 bits, k=2   (5.777771) is best
 * At e =   24 bits, k=3   (8.437629) < k=2   (8.444444)
 * At e =  3_2 bits, k=3  (10.437492) is best
 * At e =  6_4 bits, k=3  (18.437500) is best
 * At e =   81 bits, k=4 (22.6_40000) < k=3  (22.687500)
 * At e =  128 bits, k=4 (3_2.040000) is best
 * At e =  241 bits, k=5  (54.611111) < k=4 (54.6_40000)
 * At e =  256 bits, k=5  (57.111111) is best
 * At e =  512 bits, k=5  (99.777778) is best
 * At e =  673 bits, k=6 (126.591837) < k=5 (126.611111)
 * At e = 1024 bits, k=6 (176.734694) is best
 * At e = 1793 bits, k=7 (286.578125) < k=6 (286.591837)
 * At e = 2048 bits, k=7 (318.453125) is best
 * At e = 4096 bits, k=7 (574.453125) is best
 *
 * This has the rollover points at 6, 24, 81, 241, 673 and 1793 instead
 * of 8, 26, 82, 242, 674, and 1794. Not a very big difference.
 * (The numbers past that are k=8 at 4609 and k=9 at 11521,
 * vs. one more in each case for the approximation.)
 *
 * Given that exponents for which k>7 are useful are uncommon,
 * a fixed size table for k <= 7 is used for simplicity.
 *
 * The basic number of squarings needed is e-1, although a k-bit
 * window (for k > 1) can save, on average, k-2 of those, too.
 * That savings currently isn't counted here. It would drive the
 * crossover points slightly lower.
 * (Actually, this win is also reduced in the DoubleExpMod case,
 * meaning we'd have to split the tables. Except for that, the
 * multiplies by powers of the two bases are independent, so
 * the same logic applies to each as the single case.)
 *
 * Table entry i is the largest number of bits in an exponent to
 * process with a window size of i+1. Entry 6 is the largest
 * possible unsigned number, so the window will never be more
 * than 7 bits, requiring 2^6 = 0x40 slots.
 */
#define BNEXPMOD_MAX_WINDOW	7
static unsigned const bnExpModThreshTable[BNEXPMOD_MAX_WINDOW] = {
	5, 23, 80, 240, 672, 1792, (unsigned)-1
/*	7, 25, 81, 241, 673, 1793, (unsigned)-1	### The old approximations */
};
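
#if 0
/*
 * Editorial sketch, not in the original: how the table above is
 * consumed.  For an e-bit exponent, the window size k is one more than
 * the number of thresholds e exceeds; e.g. e = 1024 scans past 5, 23,
 * 80, 240 and 672, stops at 1792, and gets k = 6.  This mirrors the
 * lookup loop in lbnExpMod_64 below.
 */
static unsigned
exampleWindowSize(unsigned ebits)
{
	unsigned kminus1 = 0;

	while (ebits > bnExpModThreshTable[kminus1])
		kminus1++;
	return kminus1 + 1;	/* The window size, in bits */
}
#endif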
/*
 * Perform modular exponentiation, as fast as possible! This uses
 * Montgomery reduction, optimized squaring, and windowed exponentiation.
 * The modulus "mod" MUST be odd!
 *
 * This returns 0 on success, -1 on out of memory.
 *
 * The window algorithm:
 * The idea is to keep a running product of b1 = n^(high-order bits of exp),
 * and then keep appending exponent bits to it. The following patterns
 * apply to a 3-bit window (k = 3):
 * To append   0: square
 * To append   1: square, multiply by n^1
 * To append  10: square, multiply by n^1, square
 * To append  11: square, square, multiply by n^3
 * To append 100: square, multiply by n^1, square, square
 * To append 101: square, square, square, multiply by n^5
 * To append 110: square, square, multiply by n^3, square
 * To append 111: square, square, square, multiply by n^7
 *
 * Since each pattern involves only one multiply, the longer the pattern
 * the better, except that a 0 (no multiplies) can be appended directly.
 * We precompute a table of odd powers of n, up to 2^k, and can then
 * multiply k bits of exponent at a time. Actually, assuming random
 * exponents, there is on average one zero bit between multiplies
 * (1/2 of the time there's none, 1/4 of the time there's 1,
 * 1/8 of the time there's 2, 1/16 of the time there's 3, etc.), so
 * you have to do one multiply per k+1 bits of exponent.
 *
 * The loop walks down the exponent, squaring the result buffer as
 * it goes. There is a wbits+1 bit lookahead buffer, buf, that is
 * filled with the upcoming exponent bits. (What is read after the
 * end of the exponent is unimportant, but it is filled with zero here.)
 * When the most-significant bit of this buffer becomes set, i.e.
 * (buf & tblmask) != 0, we have to decide what pattern to multiply
 * by, and when to do it. We decide, remember to do it in future
 * after a suitable number of squarings have passed (e.g. a pattern
 * of "100" in the buffer requires that we multiply by n^1 immediately;
 * a pattern of "110" calls for multiplying by n^3 after one more
 * squaring), clear the buffer, and continue.
 *
 * When we start, there is one more optimization: the result buffer
 * is implicitly one, so squaring it or multiplying by it can be
 * optimized away. Further, if we start with a pattern like "100"
 * in the lookahead window, rather than placing n into the buffer
 * and then starting to square it, we have already computed n^2
 * to compute the odd-powers table, so we can place that into
 * the buffer and save a squaring.
 *
 * This means that if you have a k-bit window, to compute n^z,
 * where z is the high k bits of the exponent, 1/2 of the time
 * it requires no squarings. 1/4 of the time, it requires 1
 * squaring, ... 1/2^(k-1) of the time, it requires k-2 squarings.
 * And the remaining 1/2^(k-1) of the time, the top k bits are a
 * 1 followed by k-1 0 bits, so it again only requires k-2
 * squarings, not k-1. The average of these is 1. Add that
 * to the one squaring we have to do to compute the table,
 * and you'll see that a k-bit window saves k-2 squarings
 * as well as reducing the multiplies. (It actually doesn't
 * hurt in the case k = 1, either.)
 *
 * n must have mlen words allocated. Although fewer may be in use
 * when n is passed in, all are in use on exit.
 */
int
lbnExpMod_64(BNWORD64 *result, BNWORD64 const *n, unsigned nlen,
	BNWORD64 const *e, unsigned elen, BNWORD64 *mod, unsigned mlen)
{
	BNWORD64 *table[1 << (BNEXPMOD_MAX_WINDOW-1)];
				/* Table of odd powers of n */
	unsigned ebits;		/* Exponent bits */
	unsigned wbits;		/* Window size */
	unsigned tblmask;	/* Mask of exponentiation window */
	BNWORD64 bitpos;	/* Mask of current look-ahead bit */
	unsigned buf;		/* Buffer of exponent bits */
	unsigned multpos;	/* Where to do pending multiply */
	BNWORD64 const *mult;	/* What to multiply by */
	unsigned i;		/* Loop counter */
	int isone;		/* Flag: accum. is implicitly one */
	BNWORD64 *a, *b;	/* Working buffers/accumulators */
	BNWORD64 *t;		/* Pointer into the working buffers */
	BNWORD64 inv;		/* mod^-1 modulo 2^64 */
	int y;			/* bnYield() result */

	assert(nlen <= mlen);

	/* First, a couple of trivial cases. */
	elen = lbnNorm_64(e, elen);
	if (!elen) {
		/* x ^ 0 == 1 */
		lbnZero_64(result, mlen);
		BIGLITTLE(result[-1],result[0]) = 1;
		return 0;
	}
	ebits = lbnBits_64(e, elen);
	if (ebits == 1) {
		/* x ^ 1 == x */
		lbnCopy_64(result, n, nlen);
		lbnZero_64(BIGLITTLE(result-nlen,result+nlen),
		           mlen-nlen);
		return 0;
	}

	/* Okay, now move the exponent pointer to the most-significant word */
	e = BIGLITTLE(e-elen, e+elen-1);
	/* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
	wbits = 0;
	while (ebits > bnExpModThreshTable[wbits])
		wbits++;

	/* Allocate working storage: two product buffers and the tables. */
	LBNALLOC(a, BNWORD64, 2*mlen);
	if (!a)
		return -1;
	LBNALLOC(b, BNWORD64, 2*mlen);
	if (!b) {
		LBNFREE(a, 2*mlen);
		return -1;
	}

	/* Convert to the appropriate table size: tblmask = 1<<(k-1) */
	tblmask = 1u << wbits;

	/* We have the result buffer available, so use it. */
	table[0] = result;

	/*
	 * Okay, we now have a minimal-sized table - expand it.
	 * This is allowed to fail! If so, scale back the table size
	 * and proceed.
	 */
	for (i = 1; i < tblmask; i++) {
		LBNALLOC(t, BNWORD64, mlen);
		if (!t)	/* Out of memory! Quit the loop. */
			break;
		table[i] = t;
	}

	/* If we stopped, with i < tblmask, shrink the tables appropriately */
	while (tblmask > i) {
		wbits--;
		tblmask >>= 1;
	}
	/* Free up our overallocations */
	while (--i > tblmask)
		LBNFREE(table[i], mlen);

	/* Okay, fill in the table */

	/* Compute the necessary modular inverse */
	inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */

	/* Convert n to Montgomery form */

	/* Move n up "mlen" words into a */
	t = BIGLITTLE(a-mlen, a+mlen);
	lbnCopy_64(t, n, nlen);
	lbnZero_64(a, mlen);
	/* Do the division - lose the quotient into the high-order words */
	(void)lbnDiv_64(t, a, mlen+nlen, mod, mlen);
	/* Copy into first table entry */
	lbnCopy_64(table[0], a, mlen);
	/* Square a into b */
	lbnMontSquare_64(b, a, mod, mlen, inv);

	/* Use high half of b to initialize the table */
	t = BIGLITTLE(b-mlen, b+mlen);
	for (i = 1; i < tblmask; i++) {
		lbnMontMul_64(a, t, table[i-1], mod, mlen, inv);
		lbnCopy_64(table[i], BIGLITTLE(a-mlen, a+mlen), mlen);
#if BNYIELD
		if (bnYield && (y = bnYield()) < 0)
			goto yield;
#endif
	}
	/* We might use b = n^2 later... */

	/* Initialize the fetch pointer */
	bitpos = (BNWORD64)1 << ((ebits-1) & (64-1));	/* Initialize mask */

	/* This should point to the msbit of e */
	assert((*e & bitpos) != 0);

	/*
	 * Pre-load the window. Because the window size is
	 * never larger than the exponent size, there is no need to
	 * detect running off the end of e in here.
	 *
	 * The read-ahead is controlled by elen and the bitpos mask.
	 * Note that this is *ahead* of ebits, which tracks the
	 * most significant end of the window. The purpose of this
	 * initialization is to get the two wbits+1 bits apart,
	 * like they should be.
	 *
	 * Note that bitpos and elen together keep track of the
	 * lookahead read pointer in the exponent that is used here.
	 */
	buf = 0;
	for (i = 0; i <= wbits; i++) {
		buf = (buf << 1) | ((*e & bitpos) != 0);
		bitpos >>= 1;
		if (!bitpos) {
			BIGLITTLE(e++,e--);
			bitpos = (BNWORD64)1 << (64-1);
			elen--;
		}
	}
	assert(buf & tblmask);

	/*
	 * Set the pending multiply positions to a location that will
	 * never be encountered, thus ensuring that nothing will happen
	 * until the need for a multiply appears and one is scheduled.
	 */
	multpos = ebits;	/* A NULL value */
	mult = 0;		/* Force a crash if we use these */

	/*
	 * Okay, now begins the real work. The first step is
	 * slightly magic, so it's done outside the main loop,
	 * but it's very similar to what's inside.
	 */
	isone = 1;	/* Buffer is implicitly 1, so replace * by copy */
	ebits--;	/* Start processing the first bit... */
	/*
	 * This is just like the multiply in the loop, except that
	 * - We know the msbit of buf is set, and
	 * - We have the extra value n^2 floating around.
	 * So, do the usual computation, and if the result is that
	 * the buffer should be multiplied by n^1 immediately
	 * (which we'd normally then square), we multiply it
	 * (which reduces to a copy, which reduces to setting a flag)
	 * by n^2 and skip the squaring. Thus, we do the
	 * multiply and the squaring in one step.
	 */
	assert(buf & tblmask);
	multpos = ebits - wbits;
	while ((buf & 1) == 0) {
		buf >>= 1;
		multpos++;
	}
	/* Intermediates can wrap, but final must NOT */
	assert(multpos <= ebits);
	mult = table[buf>>1];
	buf = 0;

	/* Special case: use already-computed value sitting in buffer */
	if (multpos == ebits)
		isone = 0;

	/*
	 * At this point, the buffer (which is the high half of b) holds
	 * either 1 (implicitly, as the "isone" flag is set), or n^2.
	 */

	/*
	 * The main loop. The procedure is:
	 * - Advance the window
	 * - If the most-significant bit of the window is set,
	 *   schedule a multiply for the appropriate time in the
	 *   future (may be immediately)
	 * - Perform any pending multiplies
	 * - Check for termination
	 * - Square the buffer
	 *
	 * At any given time, the accumulated product is held in
	 * the high half of b.
	 */
	for (;;) {
		ebits--;

		/* Advance the window */
		assert(buf < tblmask);
		buf <<= 1;
		/*
		 * This reads ahead of the current exponent position
		 * (controlled by ebits), so we have to be able to read
		 * past the lsb of the exponents without error.
		 */
		if (elen) {
			buf |= ((*e & bitpos) != 0);
			bitpos >>= 1;
			if (!bitpos) {
				BIGLITTLE(e++,e--);
				bitpos = (BNWORD64)1 << (64-1);
				elen--;
			}
		}

		/* Examine the window for pending multiplies */
		if (buf & tblmask) {
			multpos = ebits - wbits;
			while ((buf & 1) == 0) {
				buf >>= 1;
				multpos++;
			}
			/* Intermediates can wrap, but final must NOT */
			assert(multpos <= ebits);
			mult = table[buf>>1];
			buf = 0;
		}

		/* If we have a pending multiply, do it */
		if (ebits == multpos) {
			/* Multiply by the table entry remembered previously */
			t = BIGLITTLE(b-mlen, b+mlen);
			if (isone) {
				/* Multiply by 1 is a trivial case */
				lbnCopy_64(t, mult, mlen);
				isone = 0;
			} else {
				lbnMontMul_64(a, t, mult, mod, mlen, inv);
				/* Swap a and b */
				t = a; a = b; b = t;
			}
		}

		/* Are we done? */
		if (!ebits)
			break;

		/* Square the input */
		if (!isone) {
			t = BIGLITTLE(b-mlen, b+mlen);
			lbnMontSquare_64(a, t, mod, mlen, inv);
			/* Swap a and b */
			t = a; a = b; b = t;
		}
#if BNYIELD
		if (bnYield && (y = bnYield()) < 0)
			goto yield;
#endif
	}
	/* Convert result out of Montgomery form */
	t = BIGLITTLE(b-mlen, b+mlen);
	lbnCopy_64(b, t, mlen);
	lbnZero_64(t, mlen);
	lbnMontReduce_64(b, mod, mlen, inv);
	lbnCopy_64(result, t, mlen);
	y = 0;	/* Mark success */

	/*
	 * Clean up - free intermediate storage.
	 * Do NOT free table[0], which is the result
	 * buffer.
	 */
#if BNYIELD
yield:
#endif
	while (--tblmask)
		LBNFREE(table[tblmask], mlen);
	LBNFREE(b, 2*mlen);
	LBNFREE(a, 2*mlen);

	return y;	/* Success */
}
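
/*
 * Editorial worked example, not in the original: raising n to the
 * 7-bit exponent 1011010b (90) with a 3-bit window.  The first window
 * "101" is odd, so the accumulator starts as n^5 straight from the
 * table, with no squarings.  Appending "10" (square, multiply by n^1,
 * square) yields n^(10110b) = n^22, and appending the final "10" the
 * same way yields n^(1011010b) = n^90: one table multiply per window,
 * exactly as the window-algorithm comment above promises.
 */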
/*
 * Compute and return n1^e1 * n2^e2 mod "mod".
 * result may be either input buffer, or something separate.
 * It must be "mlen" words long.
 *
 * There is a current position in the exponents, which is kept in e1bits.
 * (The exponents are swapped if necessary so e1 is the longer of the two.)
 * At any given time, the value in the accumulator is
 * n1^(e1>>e1bits) * n2^(e2>>e1bits) mod "mod".
 * As e1bits is counted down, this is updated, by squaring it and doing
 * any necessary multiplies.
 * To decide on the necessary multiplies, two windows, each w1bits+1 bits
 * wide, are maintained in buf1 and buf2, which read *ahead* of the
 * e1bits position (with appropriate handling of the case when e1bits
 * drops below w1bits+1). When the most-significant bit of either window
 * becomes set, indicating that something needs to be multiplied by
 * the accumulator or it will get out of sync, the window is examined
 * to see which power of n1 or n2 to multiply by, and when (possibly
 * later, if the power is greater than 1) the multiply should take
 * place. Then the multiply and its location are remembered and the
 * window is cleared.
 *
 * If we had every power of n1 in the table, the multiply would always
 * be w1bits steps in the future. But we only keep the odd powers,
 * so instead of waiting w1bits squarings and then multiplying
 * by n1^k, we wait w1bits-k squarings and multiply by n1.
 *
 * Actually, w2bits can be less than w1bits, but the window is the same
 * size, to make it easier to keep track of where we're reading. The
 * appropriate number of low-order bits of the window are just ignored.
 */
int
lbnDoubleExpMod_64(BNWORD64 *result,
	BNWORD64 const *n1, unsigned n1len,
	BNWORD64 const *e1, unsigned e1len,
	BNWORD64 const *n2, unsigned n2len,
	BNWORD64 const *e2, unsigned e2len,
	BNWORD64 *mod, unsigned mlen)
{
	BNWORD64 *table1[1 << (BNEXPMOD_MAX_WINDOW-1)];
				/* Table of odd powers of n1 */
	BNWORD64 *table2[1 << (BNEXPMOD_MAX_WINDOW-1)];
				/* Table of odd powers of n2 */
	unsigned e1bits, e2bits;	/* Exponent bits */
	unsigned w1bits, w2bits;	/* Window sizes */
	unsigned tblmask;	/* Mask of exponentiation window */
	BNWORD64 bitpos;	/* Mask of current look-ahead bit */
	unsigned buf1, buf2;	/* Buffer of exponent bits */
	unsigned mult1pos, mult2pos;	/* Where to do pending multiply */
	BNWORD64 const *mult1, *mult2;	/* What to multiply by */
	unsigned i;		/* Loop counter */
	int isone;		/* Flag: accum. is implicitly one */
	BNWORD64 *a, *b;	/* Working buffers/accumulators */
	BNWORD64 *t;		/* Pointer into the working buffers */
	BNWORD64 inv;		/* mod^-1 modulo 2^64 */
	int y;			/* bnYield() result */

	assert(n1len <= mlen);
	assert(n2len <= mlen);
	/* First, a couple of trivial cases. */
	e1len = lbnNorm_64(e1, e1len);
	e2len = lbnNorm_64(e2, e2len);

	/* Ensure that the first exponent is the longer */
	e1bits = lbnBits_64(e1, e1len);
	e2bits = lbnBits_64(e2, e2len);
	if (e1bits < e2bits) {
		i = e1len; e1len = e2len; e2len = i;
		i = e1bits; e1bits = e2bits; e2bits = i;
		t = (BNWORD64 *)n1; n1 = n2; n2 = t;
		t = (BNWORD64 *)e1; e1 = e2; e2 = t;
		i = n1len; n1len = n2len; n2len = i;
	}
	assert(e1bits >= e2bits);

	/* Handle a trivial case */
	if (!e2len)
		return lbnExpMod_64(result, n1, n1len, e1, e1len, mod, mlen);

	/* The code below fucks up if the exponents aren't at least 2 bits */
	if (e1bits == 1) {
		assert(e2bits == 1);
		LBNALLOC(a, BNWORD64, n1len+n2len);
		if (!a)
			return -1;
		lbnMul_64(a, n1, n1len, n2, n2len);
		/* Do a direct modular reduction */
		if (n1len + n2len >= mlen)
			(void)lbnDiv_64(a+mlen, a, n1len+n2len, mod, mlen);
		lbnCopy_64(result, a, mlen);
		LBNFREE(a, n1len+n2len);
		return 0;
	}
	/* Okay, now move the exponent pointers to the most-significant word */
	e1 = BIGLITTLE(e1-e1len, e1+e1len-1);
	e2 = BIGLITTLE(e2-e2len, e2+e2len-1);

	/* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
	w1bits = 0;
	while (e1bits > bnExpModThreshTable[w1bits])
		w1bits++;
	w2bits = 0;
	while (e2bits > bnExpModThreshTable[w2bits])
		w2bits++;

	assert(w1bits >= w2bits);

	/* Allocate working storage: two product buffers and the tables. */
	LBNALLOC(a, BNWORD64, 2*mlen);
	if (!a)
		return -1;
	LBNALLOC(b, BNWORD64, 2*mlen);
	if (!b) {
		LBNFREE(a, 2*mlen);
		return -1;
	}

	/* Convert to the appropriate table size: tblmask = 1<<(k-1) */
	tblmask = 1u << w1bits;
	/* Use buf2 for its size, temporarily */
	buf2 = 1u << w2bits;

	/* Allocate the first entry of each table */
	LBNALLOC(t, BNWORD64, mlen);
	if (!t) {
		LBNFREE(b, 2*mlen);
		LBNFREE(a, 2*mlen);
		return -1;
	}
	table1[0] = t;
	LBNALLOC(t, BNWORD64, mlen);
	if (!t) {
		LBNFREE(table1[0], mlen);
		LBNFREE(b, 2*mlen);
		LBNFREE(a, 2*mlen);
		return -1;
	}
	table2[0] = t;
	/*
	 * Okay, we now have some minimal-sized tables - expand them.
	 * This is allowed to fail! If so, scale back the table sizes
	 * and proceed. We allocate both tables at the same time
	 * so if it fails partway through, they'll both be a reasonable
	 * size rather than one huge and one tiny.
	 * When i passes buf2 (the number of entries in the e2 window,
	 * which may be less than the number of entries in the e1 window),
	 * stop allocating e2 space.
	 */
	for (i = 1; i < tblmask; i++) {
		LBNALLOC(t, BNWORD64, mlen);
		if (!t)	/* Out of memory! Quit the loop. */
			break;
		table1[i] = t;
		if (i < buf2) {
			LBNALLOC(t, BNWORD64, mlen);
			if (!t) {
				LBNFREE(table1[i], mlen);
				break;
			}
			table2[i] = t;
		}
	}

	/* If we stopped, with i < tblmask, shrink the tables appropriately */
	while (tblmask > i) {
		w1bits--;
		tblmask >>= 1;
	}
	/* Free up our overallocations */
	while (--i > tblmask) {
		if (i < buf2)
			LBNFREE(table2[i], mlen);
		LBNFREE(table1[i], mlen);
	}
	/* And shrink the second window too, if needed */
	if (w2bits > w1bits) {
		w2bits = w1bits;
		buf2 = tblmask;
	}

	/*
	 * From now on, use the w2bits variable for the difference
	 * between w1bits and w2bits.
	 */
	w2bits = w1bits-w2bits;
	/* Okay, fill in the tables */

	/* Compute the necessary modular inverse */
	inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */

	/* Convert n1 to Montgomery form */

	/* Move n1 up "mlen" words into a */
	t = BIGLITTLE(a-mlen, a+mlen);
	lbnCopy_64(t, n1, n1len);
	lbnZero_64(a, mlen);
	/* Do the division - lose the quotient into the high-order words */
	(void)lbnDiv_64(t, a, mlen+n1len, mod, mlen);
	/* Copy into first table entry */
	lbnCopy_64(table1[0], a, mlen);

	/* Square a into b */
	lbnMontSquare_64(b, a, mod, mlen, inv);

	/* Use high half of b to initialize the first table */
	t = BIGLITTLE(b-mlen, b+mlen);
	for (i = 1; i < tblmask; i++) {
		lbnMontMul_64(a, t, table1[i-1], mod, mlen, inv);
		lbnCopy_64(table1[i], BIGLITTLE(a-mlen, a+mlen), mlen);
#if BNYIELD
		if (bnYield && (y = bnYield()) < 0)
			goto yield;
#endif
	}
	/* Convert n2 to Montgomery form */

	t = BIGLITTLE(a-mlen, a+mlen);
	/* Move n2 up "mlen" words into a */
	lbnCopy_64(t, n2, n2len);
	lbnZero_64(a, mlen);
	/* Do the division - lose the quotient into the high-order words */
	(void)lbnDiv_64(t, a, mlen+n2len, mod, mlen);
	/* Copy into first table entry */
	lbnCopy_64(table2[0], a, mlen);

	/* Square it into a */
	lbnMontSquare_64(a, table2[0], mod, mlen, inv);
	/* Copy to b, low half */
	lbnCopy_64(b, t, mlen);

	/* Use b to initialize the second table */
	for (i = 1; i < buf2; i++) {
		lbnMontMul_64(a, b, table2[i-1], mod, mlen, inv);
		lbnCopy_64(table2[i], t, mlen);
#if BNYIELD
		if (bnYield && (y = bnYield()) < 0)
			goto yield;
#endif
	}
	/*
	 * Okay, a recap: at this point, the low part of b holds
	 * n2^2, the high part holds n1^2, and the tables are
	 * initialized with the odd powers of n1 and n2 from 1
	 * through 2*tblmask-1 and 2*buf2-1.
	 *
	 * We might use those squares in b later, or we might not.
	 */

	/* Initialize the fetch pointer */
	bitpos = (BNWORD64)1 << ((e1bits-1) & (64-1));	/* Initialize mask */

	/* This should point to the msbit of e1 */
	assert((*e1 & bitpos) != 0);
	/*
	 * Pre-load the windows. Because the window size is
	 * never larger than the exponent size, there is no need to
	 * detect running off the end of e1 in here.
	 *
	 * The read-ahead is controlled by e1len and the bitpos mask.
	 * Note that this is *ahead* of e1bits, which tracks the
	 * most significant end of the window. The purpose of this
	 * initialization is to get the two w1bits+1 bits apart,
	 * like they should be.
	 *
	 * Note that bitpos and e1len together keep track of the
	 * lookahead read pointer in the exponent that is used here.
	 * e2len is not decremented, it is only ever compared with
	 * e1len as *that* is decremented.
	 */
	buf1 = buf2 = 0;
	for (i = 0; i <= w1bits; i++) {
		buf1 = (buf1 << 1) | ((*e1 & bitpos) != 0);
		if (e1len <= e2len)
			buf2 = (buf2 << 1) | ((*e2 & bitpos) != 0);
		bitpos >>= 1;
		if (!bitpos) {
			BIGLITTLE(e1++,e1--);
			if (e1len <= e2len)
				BIGLITTLE(e2++,e2--);
			bitpos = (BNWORD64)1 << (64-1);
			e1len--;
		}
	}
	assert(buf1 & tblmask);

	/*
	 * Set the pending multiply positions to a location that will
	 * never be encountered, thus ensuring that nothing will happen
	 * until the need for a multiply appears and one is scheduled.
	 */
	mult1pos = mult2pos = e1bits;	/* A NULL value */
	mult1 = mult2 = 0;	/* Force a crash if we use these */

	/*
	 * Okay, now begins the real work. The first step is
	 * slightly magic, so it's done outside the main loop,
	 * but it's very similar to what's inside.
	 */
	isone = 1;	/* Buffer is implicitly 1, so replace * by copy */
	e1bits--;	/* Start processing the first bit... */
	/*
	 * This is just like the multiply in the loop, except that
	 * - We know the msbit of buf1 is set, and
	 * - We have the extra value n1^2 floating around.
	 * So, do the usual computation, and if the result is that
	 * the buffer should be multiplied by n1^1 immediately
	 * (which we'd normally then square), we multiply it
	 * (which reduces to a copy, which reduces to setting a flag)
	 * by n1^2 and skip the squaring. Thus, we do the
	 * multiply and the squaring in one step.
	 */
	assert(buf1 & tblmask);
	mult1pos = e1bits - w1bits;
	while ((buf1 & 1) == 0) {
		buf1 >>= 1;
		mult1pos++;
	}
	/* Intermediates can wrap, but final must NOT */
	assert(mult1pos <= e1bits);
	mult1 = table1[buf1>>1];
	buf1 = 0;

	/* Special case: use already-computed value sitting in buffer */
	if (mult1pos == e1bits)
		isone = 0;

	/*
	 * The first multiply by a power of n2. Similar, but
	 * we might not even want to schedule a multiply if e2 is
	 * shorter than e1, and the window might be shorter so
	 * we have to leave the low w2bits bits alone.
	 */
	if (buf2 & tblmask) {
		/* Remember low-order bits for later */
		i = buf2 & ((1u << w2bits) - 1);
		buf2 >>= w2bits;
		mult2pos = e1bits - w1bits + w2bits;
		while ((buf2 & 1) == 0) {
			buf2 >>= 1;
			mult2pos++;
		}
		assert(mult2pos <= e1bits);
		mult2 = table2[buf2>>1];
		buf2 = i;

		if (mult2pos == e1bits) {
			t = BIGLITTLE(b-mlen, b+mlen);
			if (isone) {
				lbnCopy_64(t, b, mlen);	/* Copy low to high */
				isone = 0;
			} else {
				lbnMontMul_64(a, t, b, mod, mlen, inv);
				t = a; a = b; b = t;
			}
		}
	}
	/*
	 * At this point, the buffer (which is the high half of b)
	 * holds either 1 (implicitly, as the "isone" flag is set),
	 * n1^2, n2^2 or n1^2 * n2^2.
	 */

	/*
	 * The main loop. The procedure is:
	 * - Advance the windows
	 * - If the most-significant bit of a window is set,
	 *   schedule a multiply for the appropriate time in the
	 *   future (may be immediately)
	 * - Perform any pending multiplies
	 * - Check for termination
	 * - Square the buffers
	 *
	 * At any given time, the accumulated product is held in
	 * the high half of b.
	 */
	for (;;) {
		e1bits--;

		/* Advance the windows */
		assert(buf1 < tblmask);
		buf1 <<= 1;
		assert(buf2 < tblmask);
		buf2 <<= 1;
		/*
		 * This reads ahead of the current exponent position
		 * (controlled by e1bits), so we have to be able to read
		 * past the lsb of the exponents without error.
		 */
		if (e1len) {
			buf1 |= ((*e1 & bitpos) != 0);
			if (e1len <= e2len)
				buf2 |= ((*e2 & bitpos) != 0);
			bitpos >>= 1;
			if (!bitpos) {
				BIGLITTLE(e1++,e1--);
				if (e1len <= e2len)
					BIGLITTLE(e2++,e2--);
				bitpos = (BNWORD64)1 << (64-1);
				e1len--;
			}
		}

		/* Examine the first window for pending multiplies */
		if (buf1 & tblmask) {
			mult1pos = e1bits - w1bits;
			while ((buf1 & 1) == 0) {
				buf1 >>= 1;
				mult1pos++;
			}
			/* Intermediates can wrap, but final must NOT */
			assert(mult1pos <= e1bits);
			mult1 = table1[buf1>>1];
			buf1 = 0;
		}

		/*
		 * Examine the second window for pending multiplies.
		 * Window 2 can be smaller than window 1, but we
		 * keep the same number of bits in buf2, so we need
		 * to ignore any low-order bits in the buffer when
		 * computing what to multiply by, and recompute them
		 * later.
		 */
		if (buf2 & tblmask) {
			/* Remember low-order bits for later */
			i = buf2 & ((1u << w2bits) - 1);
			buf2 >>= w2bits;
			mult2pos = e1bits - w1bits + w2bits;
			while ((buf2 & 1) == 0) {
				buf2 >>= 1;
				mult2pos++;
			}
			assert(mult2pos <= e1bits);
			mult2 = table2[buf2>>1];
			buf2 = i;
		}
		/* If we have a pending multiply for e1, do it */
		if (e1bits == mult1pos) {
			/* Multiply by the table entry remembered previously */
			t = BIGLITTLE(b-mlen, b+mlen);
			if (isone) {
				/* Multiply by 1 is a trivial case */
				lbnCopy_64(t, mult1, mlen);
				isone = 0;
			} else {
				lbnMontMul_64(a, t, mult1, mod, mlen, inv);
				/* Swap a and b */
				t = a; a = b; b = t;
			}
		}

		/* If we have a pending multiply for e2, do it */
		if (e1bits == mult2pos) {
			/* Multiply by the table entry remembered previously */
			t = BIGLITTLE(b-mlen, b+mlen);
			if (isone) {
				/* Multiply by 1 is a trivial case */
				lbnCopy_64(t, mult2, mlen);
				isone = 0;
			} else {
				lbnMontMul_64(a, t, mult2, mod, mlen, inv);
				/* Swap a and b */
				t = a; a = b; b = t;
			}
		}

		/* Are we done? */
		if (!e1bits)
			break;

		/* Square the buffer */
		if (!isone) {
			t = BIGLITTLE(b-mlen, b+mlen);
			lbnMontSquare_64(a, t, mod, mlen, inv);
			/* Swap a and b */
			t = a; a = b; b = t;
		}
#if BNYIELD
		if (bnYield && (y = bnYield()) < 0)
			goto yield;
#endif
	}
	/* Convert result out of Montgomery form */
	t = BIGLITTLE(b-mlen, b+mlen);
	lbnCopy_64(b, t, mlen);
	lbnZero_64(t, mlen);
	lbnMontReduce_64(b, mod, mlen, inv);
	lbnCopy_64(result, t, mlen);
	y = 0;	/* Mark success */

	/* Clean up - free intermediate storage */
#if BNYIELD
yield:
#endif
	buf2 = tblmask >> w2bits;	/* Entries in the second table */
	while (--tblmask) {
		if (tblmask < buf2)
			LBNFREE(table2[tblmask], mlen);
		LBNFREE(table1[tblmask], mlen);
	}
	LBNFREE(table2[0], mlen);
	LBNFREE(table1[0], mlen);
	LBNFREE(b, 2*mlen);
	LBNFREE(a, 2*mlen);

	return y;	/* Success */
}
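
/*
 * Editorial usage note, not in the original: a single call
 *	lbnDoubleExpMod_64(result, n1, n1len, e1, e1len,
 *	                   n2, n2len, e2, e2len, mod, mlen)
 * computes n1^e1 * n2^e2 mod "mod" while sharing one chain of squarings
 * between the two exponents; the alternative of two lbnExpMod_64 calls
 * plus a modular multiply squares roughly twice as often.
 */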
/*
 * 2^exp (mod mod). This is an optimized version for use in Fermat
 * tests. The input value of n is ignored; it is returned with
 * "mlen" words valid.
 */
int
lbnTwoExpMod_64(BNWORD64 *n, BNWORD64 const *exp, unsigned elen,
	BNWORD64 *mod, unsigned mlen)
{
	unsigned e;	/* Copy of high words of the exponent */
	unsigned bits;	/* Assorted counter of bits */
	BNWORD64 const *bitptr;
	BNWORD64 bitword, bitpos;
	BNWORD64 *a, *b, *a1;
	BNWORD64 inv;
	int y;		/* Result of bnYield() */
	bitptr = BIGLITTLE(exp-elen, exp+elen-1);

	/* Clear n for future use. */
	lbnZero_64(n, mlen);

	bits = lbnBits_64(exp, elen);

	/* First, a couple of trivial cases. */
	if (bits <= 1) {
		/* 2 ^ 0 == 1,  2 ^ 1 == 2 */
		BIGLITTLE(n[-1],n[0]) = (BNWORD64)1<<elen;
		return 0;
	}

	/* Set bitpos to the most significant bit */
	bitpos = (BNWORD64)1 << ((bits-1) & (64-1));

	/* Now, count the bits in the modulus. */
	bits = lbnBits_64(mod, mlen);
	assert(bits > 1);	/* a 1-bit modulus is just stupid... */

	/*
	 * We start with 1<<e, where "e" is as many high bits of the
	 * exponent as we can manage without going over the modulus.
	 * This first loop finds "e".
	 */
	e = 1;	/* The most-significant bit of the exponent is always 1 */
	bitword = *bitptr;	/* Most-significant word of the exponent */
	for (;;) {
		/* Consume the first bit */
		bitpos >>= 1;
		if (!bitpos) {
			if (!--elen)
				break;
			bitword = BIGLITTLE(*++bitptr,*--bitptr);
			bitpos = (BNWORD64)1<<(64-1);
		}
		e = (e << 1) | ((bitpos & bitword) != 0);
		if (e >= bits) {	/* Overflow! Back out. */
			e >>= 1;
			break;
		}
	}
	/*
	 * The bit in "bitpos" being examined by the bit buffer has NOT
	 * been consumed yet. This may be past the end of the exponent,
	 * in which case elen == 1.
	 */

	/* Okay, now, set bit "e" in n. n is already zero. */
	inv = (BNWORD64)1 << (e & (64-1));
	e /= 64;	/* Convert to a word offset */
	BIGLITTLE(n[-e-1],n[e]) = inv;
	/*
	 * The effective length of n in words is now "e+1".
	 * This is used a little bit later.
	 */

	if (!elen)
		return 0;	/* That was easy! */

	/*
	 * We have now processed the first few bits. The next step
	 * is to convert this to Montgomery form for further squaring.
	 */

	/* Allocate working storage: two product buffers */
	LBNALLOC(a, BNWORD64, 2*mlen);
	if (!a)
		return -1;
	LBNALLOC(b, BNWORD64, 2*mlen);
	if (!b) {
		LBNFREE(a, 2*mlen);
		return -1;
	}
	/* Convert n to Montgomery form */
	inv = BIGLITTLE(mod[-1],mod[0]);	/* LSW of modulus */
	assert(inv & 1);	/* Modulus must be odd */
	inv = lbnMontInv1_64(inv);
	/* Move n (length e+1, remember?) up "mlen" words into b */
	/* Note that we lie about a1 for a bit - it's pointing to b */
	a1 = BIGLITTLE(b-mlen,b+mlen);
	lbnCopy_64(a1, n, e+1);
	lbnZero_64(b, mlen);
	/* Do the division - dump the quotient into the high-order words */
	(void)lbnDiv_64(a1, b, mlen+e+1, mod, mlen);
	/*
	 * Now do the first squaring and modular reduction to put
	 * the number up in a1 where it belongs.
	 */
	lbnMontSquare_64(a, b, mod, mlen, inv);
	/* Fix up a1 to point to where it should go. */
	a1 = BIGLITTLE(a-mlen,a+mlen);
	/*
	 * Okay, now, a1 holds the number being accumulated, and
	 * b is a scratch register. Start working:
	 */
	for (;;) {
		/*
		 * Is the bit set? If so, double a1 as well.
		 * A modular doubling like this is very cheap.
		 */
		if (bitpos & bitword) {
			/*
			 * Double the number. If there was a carry out OR
			 * the result is greater than the modulus, subtract
			 * the modulus.
			 */
			if (lbnDouble_64(a1, mlen) ||
			    lbnCmp_64(a1, mod, mlen) > 0)
				(void)lbnSubN_64(a1, mod, mlen);
		}

		/* Advance to the next exponent bit */
		bitpos >>= 1;
		if (!bitpos) {
			if (!--elen)
				break;	/* Done! */
			bitword = BIGLITTLE(*++bitptr,*--bitptr);
			bitpos = (BNWORD64)1<<(64-1);
		}

		/*
		 * The elen/bitword/bitpos bit buffer is known to be
		 * non-empty, i.e. there is at least one more unconsumed bit.
		 * Thus, it's safe to square the number.
		 */
		lbnMontSquare_64(b, a1, mod, mlen, inv);
		/* Rename result (in b) back to a (a1, really). */
		a1 = b; b = a; a = a1;
		a1 = BIGLITTLE(a-mlen,a+mlen);
#if BNYIELD
		if (bnYield && (y = bnYield()) < 0)
			goto yield;
#endif
	}
	/* DONE! Just a little bit of cleanup... */

	/*
	 * Convert result out of Montgomery form... this is
	 * just a Montgomery reduction.
	 */
	lbnCopy_64(a, a1, mlen);
	lbnZero_64(a1, mlen);
	lbnMontReduce_64(a, mod, mlen, inv);
	lbnCopy_64(n, a1, mlen);
	y = 0;	/* Mark success */

	/* Clean up - free intermediate storage */
#if BNYIELD
yield:
#endif
	LBNFREE(b, 2*mlen);
	LBNFREE(a, 2*mlen);

	return y;	/* Success */
}
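
#if 0
/*
 * Editorial sketch, not part of the original: the cheap modular
 * doubling used in lbnTwoExpMod_64 above, pulled out as a helper.
 * Doubling a value already less than mod can at most reach 2*mod-2,
 * so one conditional subtract renormalizes; equality with mod is
 * impossible because mod is odd.
 */
static void
exampleModDouble_64(BNWORD64 *a, BNWORD64 *mod, unsigned mlen)
{
	if (lbnDouble_64(a, mlen) || lbnCmp_64(a, mod, mlen) > 0)
		(void)lbnSubN_64(a, mod, mlen);
}
#endif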
/*
 * Returns a substring of the big-endian array of bytes representation
 * of the bignum array based on two parameters, the least significant
 * byte number (0 to start with the least significant byte) and the
 * length. I.e. the number returned is a representation of
 * (bn / 2^(8*lsbyte)) % 2^(8*buflen).
 *
 * It is an error if the bignum is not at least buflen + lsbyte bytes
 * long.
 *
 * This code assumes that the compiler has the minimal intelligence
 * needed to optimize divides and modulo operations on an unsigned data
 * type with a power of two.
 */
void
lbnExtractBigBytes_64(BNWORD64 const *n, unsigned char *buf,
	unsigned lsbyte, unsigned buflen)
{
	BNWORD64 t = 0;	/* Needed to shut up uninitialized var warnings */
	unsigned shift;

	lsbyte += buflen;

	shift = (8 * lsbyte) % 64;
	lsbyte /= (64/8);	/* Convert to word offset */
	BIGLITTLE(n -= lsbyte, n += lsbyte);

	if (shift)
		t = BIGLITTLE(n[-1],n[0]);

	while (buflen--) {
		if (!shift) {
			t = BIGLITTLE(*n++,*--n);
			shift = 64;
		}
		shift -= 8;
		*buf++ = (unsigned char)(t>>shift);
	}
}
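
/*
 * Editorial example, not in the original: with lsbyte = 4 and
 * buflen = 4, the four bytes written are a big-endian rendering of
 * (bn / 2^32) % 2^32, i.e. bits 32..63 of the bignum; lsbyte = 0 and
 * buflen = 4 would give the low 32 bits instead.
 */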
/*
 * Merge a big-endian array of bytes into a bignum array.
 * The array had better be big enough. This is
 * equivalent to extracting the entire bignum into a
 * large byte array, copying the input buffer into the
 * middle of it, and converting back to a bignum.
 *
 * The buf is "len" bytes long, and its *last* byte is at
 * position "lsbyte" from the end of the bignum.
 *
 * Note that this is a pain to get right. Fortunately, it's hardly
 * critical for efficiency.
 */
void
lbnInsertBigBytes_64(BNWORD64 *n, unsigned char const *buf,
	unsigned lsbyte, unsigned buflen)
{
	BNWORD64 t = 0;	/* Shut up uninitialized variable warnings */

	lsbyte += buflen;

	BIGLITTLE(n -= lsbyte/(64/8), n += lsbyte/(64/8));

	/* Load up leading odd bytes */
	if (lsbyte % (64/8)) {
		t = BIGLITTLE(*--n,*n++);
		t >>= (lsbyte * 8) % 64;
	}

	/* The main loop - merge into t, storing at each word boundary. */
	while (buflen--) {
		t = (t << 8) | *buf++;
		if ((--lsbyte % (64/8)) == 0)
			BIGLITTLE(*n++,*--n) = t;
	}

	/* Merge odd bytes in t into last word */
	lsbyte = (lsbyte * 8) % 64;
	if (lsbyte) {
		t <<= lsbyte;
		t |= (((BNWORD64)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
		BIGLITTLE(n[0],n[-1]) = t;
	}
}
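
/*
 * Editorial note, not in the original: this is the exact inverse of
 * lbnExtractBigBytes_64 above - extracting a byte range and then
 * immediately inserting it back at the same (lsbyte, buflen) position
 * leaves the bignum unchanged.
 */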
/*
 * Returns a substring of the little-endian array of bytes representation
 * of the bignum array based on two parameters, the least significant
 * byte number (0 to start with the least significant byte) and the
 * length. I.e. the number returned is a representation of
 * (bn / 2^(8*lsbyte)) % 2^(8*buflen).
 *
 * It is an error if the bignum is not at least buflen + lsbyte bytes
 * long.
 *
 * This code assumes that the compiler has the minimal intelligence
 * needed to optimize divides and modulo operations on an unsigned data
 * type with a power of two.
 */
void
lbnExtractLittleBytes_64(BNWORD64 const *n, unsigned char *buf,
	unsigned lsbyte, unsigned buflen)
{
	BNWORD64 t = 0;	/* Needed to shut up uninitialized var warnings */

	BIGLITTLE(n -= lsbyte/(64/8), n += lsbyte/(64/8));

	if (lsbyte % (64/8)) {
		t = BIGLITTLE(*--n,*n++);
		t >>= (lsbyte % (64/8)) * 8;
	}

	while (buflen--) {
		if ((lsbyte++ % (64/8)) == 0)
			t = BIGLITTLE(*--n,*n++);
		*buf++ = (unsigned char)t;
		t >>= 8;
	}
}
/*
 * Merge a little-endian array of bytes into a bignum array.
 * The array had better be big enough. This is
 * equivalent to extracting the entire bignum into a
 * large byte array, copying the input buffer into the
 * middle of it, and converting back to a bignum.
 *
 * The buf is "len" bytes long, and its first byte is at
 * position "lsbyte" from the end of the bignum.
 *
 * Note that this is a pain to get right. Fortunately, it's hardly
 * critical for efficiency.
 */
void
lbnInsertLittleBytes_64(BNWORD64 *n, unsigned char const *buf,
	unsigned lsbyte, unsigned buflen)
{
	BNWORD64 t = 0;	/* Shut up uninitialized variable warnings */

	/* Move to most-significant end */
	lsbyte += buflen;
	buf += buflen;

	BIGLITTLE(n -= lsbyte/(64/8), n += lsbyte/(64/8));

	/* Load up leading odd bytes */
	if (lsbyte % (64/8)) {
		t = BIGLITTLE(*--n,*n++);
		t >>= (lsbyte * 8) % 64;
	}

	/* The main loop - merge into t, storing at each word boundary. */
	while (buflen--) {
		t = (t << 8) | *--buf;
		if ((--lsbyte % (64/8)) == 0)
			BIGLITTLE(*n++,*--n) = t;
	}

	/* Merge odd bytes in t into last word */
	lsbyte = (lsbyte * 8) % 64;
	if (lsbyte) {
		t <<= lsbyte;
		t |= (((BNWORD64)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
		BIGLITTLE(n[0],n[-1]) = t;
	}
}
#ifdef DEADCODE	/* This was a precursor to the more flexible lbnExtractBytes */
/*
 * Convert a big-endian array of bytes to a bignum.
 * Returns the number of words in the bignum.
 * Note the expression "64/8" for the number of bytes per word.
 * This is so the word-size adjustment will work.
 */
unsigned
lbnFromBytes_64(BNWORD64 *a, unsigned char const *b, unsigned blen)
{
	BNWORD64 t;
	unsigned alen = (blen + (64/8-1))/(64/8);
	BIGLITTLE(a -= alen, a += alen);

	while (blen) {
		t = 0;
		do {
			t = (t << 8) | *b++;
		} while (--blen & (64/8-1));
		BIGLITTLE(*a++,*--a) = t;
	}
	return alen;
}
#endif /* DEADCODE */
/*
 * Computes the GCD of a and b. Modifies both arguments; when it returns,
 * one of them is the GCD and the other is trash. The return value
 * indicates which: 0 for a, and 1 for b. The length of the result is
 * returned in rlen. Both inputs must have one extra word of precision.
 * alen must be >= blen.
 *
 * TODO: use the binary algorithm (Knuth section 4.5.2, algorithm B).
 * This is based on taking out common powers of 2, then repeatedly:
 * gcd(2*u,v) = gcd(u,2*v) = gcd(u,v) - isolated powers of 2 can be deleted.
 * gcd(u,v) = gcd(u-v,v) - the numbers can be easily reduced.
 * It gets less reduction per step, but the steps are much faster than
 * the division case.
 */
int
lbnGcd_64(BNWORD64 *a, unsigned alen, BNWORD64 *b, unsigned blen,
	unsigned *rlen)
{
#if BNYIELD
	int y;
#endif

	assert(alen >= blen);

	while (blen != 0) {
		(void)lbnDiv_64(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
		alen = lbnNorm_64(a, blen);
		if (alen == 0) {
			*rlen = blen;
			return 1;	/* The GCD is in b */
		}
		(void)lbnDiv_64(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
		blen = lbnNorm_64(b, alen);
#if BNYIELD
		if (bnYield && (y = bnYield()) < 0)
			return y;
#endif
	}
	*rlen = alen;
	return 0;	/* The GCD is in a */
}
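
#if 0
/*
 * Editorial sketch, not part of the original: the single-word analogue
 * of the reduction loop above - plain Euclid, alternately reducing
 * each argument modulo the other, just as lbnGcd_64 does with
 * lbnDiv_64.
 */
static BNWORD64
exampleGcd1(BNWORD64 a, BNWORD64 b)
{
	BNWORD64 t;

	while (b) {
		t = a % b;
		a = b;
		b = t;
	}
	return a;
}
#endif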
/*
 * Invert "a" modulo "mod" using the extended Euclidean algorithm.
 * Note that this only computes one of the cosequences, and uses the
 * theorem that the signs flip every step and the absolute value of
 * the cosequence values are always bounded by the modulus to avoid
 * having to work with negative numbers.
 * gcd(a,mod) had better equal 1. Returns 1 if the GCD is NOT 1.
 * a must be one word longer than "mod". It is overwritten with the
 * result.
 *
 * TODO: Use Richard Schroeppel's *much* faster algorithm.
 */
int
lbnInv_64(BNWORD64 *a, unsigned alen, BNWORD64 const *mod, unsigned mlen)
{
	BNWORD64 *b;	/* Hold a copy of mod during GCD reduction */
	BNWORD64 *p;	/* Temporary for products added to t0 and t1 */
	BNWORD64 *t0, *t1;	/* Inverse accumulators */
	BNWORD64 cy;
	unsigned blen, t0len, t1len, plen;
	int y;

	alen = lbnNorm_64(a, alen);
	if (!alen)
		return 1;	/* No inverse */

	mlen = lbnNorm_64(mod, mlen);

	assert (alen <= mlen);

	/* Inverse of 1 is 1 */
	if (alen == 1 && BIGLITTLE(a[-1],a[0]) == 1) {
		lbnZero_64(BIGLITTLE(a-alen,a+alen), mlen-alen);
		return 0;
	}
	/* Allocate a pile of space */
	LBNALLOC(b, BNWORD64, mlen+1);
	if (b) {
		/*
		 * Although products are guaranteed to always be less than the
		 * modulus, it can involve multiplying two 3-word numbers to
		 * get a 5-word result, requiring a 6th word to store a 0
		 * temporarily. Thus, mlen + 1.
		 */
		LBNALLOC(p, BNWORD64, mlen+1);
		if (p) {
			LBNALLOC(t0, BNWORD64, mlen);
			if (t0) {
				LBNALLOC(t1, BNWORD64, mlen);
				if (t1)
					goto allocated;
				LBNFREE(t0, mlen);
			}
			LBNFREE(p, mlen+1);
		}
		LBNFREE(b, mlen+1);
	}
	return -1;

allocated:
	/* Set t0 = 1 */
	t0len = 1;
	BIGLITTLE(t0[-1],t0[0]) = 1;

	/* b = mod */
	lbnCopy_64(b, mod, mlen);
	/* blen = mlen (implicitly) */

	/* t1 = b / a; b = b % a */
	cy = lbnDiv_64(t1, b, mlen, a, alen);
	*(BIGLITTLE(t1-(mlen-alen)-1,t1+(mlen-alen))) = cy;
	t1len = lbnNorm_64(t1, mlen-alen+1);
	blen = lbnNorm_64(b, alen);
	/* while (b > 1) { */
	while (blen > 1 || BIGLITTLE(b[-1],b[0]) != (BNWORD64)1) {
		/* q = a / b; a = a % b; */
		if (alen < blen || (alen == blen && lbnCmp_64(a, b, alen) < 0))
			goto failure;	/* b > a: GCD not 1 */
		cy = lbnDiv_64(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
		*(BIGLITTLE(a-alen-1,a+alen)) = cy;
		plen = lbnNorm_64(BIGLITTLE(a-blen,a+blen), alen-blen+1);
		alen = lbnNorm_64(a, blen);
		if (!alen)
			goto failure;	/* GCD not 1 */

		/* t0 += q * t1 (the quotient is at the top of a) */
		assert(plen+t1len <= mlen+1);
		lbnMul_64(p, BIGLITTLE(a-blen,a+blen), plen, t1, t1len);
		plen = lbnNorm_64(p, plen + t1len);
		assert(plen <= mlen);
		if (plen > t0len) {
			lbnZero_64(BIGLITTLE(t0-t0len,t0+t0len), plen-t0len);
			t0len = plen;
		}
		cy = lbnAddN_64(t0, p, plen);
		if (cy) {
			if (t0len > plen)
				cy = lbnAdd1_64(BIGLITTLE(t0-plen,t0+plen),
						t0len-plen, cy);
			if (cy) {
				BIGLITTLE(t0[-t0len-1],t0[t0len]) = cy;
				t0len++;
			}
		}
		/* if (a <= 1) return a ? t0 : FAIL; */
		if (alen <= 1 && BIGLITTLE(a[-1],a[0]) == (BNWORD64)1) {
			if (!alen)
				goto failure;	/* FAIL */
			assert(t0len <= mlen);
			lbnCopy_64(a, t0, t0len);
			lbnZero_64(BIGLITTLE(a-t0len, a+t0len), mlen-t0len);
			goto success;
		}
		/* q = b / a; b = b % a; */
		if (blen < alen || (blen == alen && lbnCmp_64(b, a, alen) < 0))
			goto failure;	/* a > b: GCD not 1 */
		cy = lbnDiv_64(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
		*(BIGLITTLE(b-blen-1,b+blen)) = cy;
		plen = lbnNorm_64(BIGLITTLE(b-alen,b+alen), blen-alen+1);
		blen = lbnNorm_64(b, alen);
		if (!blen)
			goto failure;	/* GCD not 1 */
		/* t1 += q * t0 (the quotient is at the top of b) */
		assert(plen+t0len <= mlen+1);
		lbnMul_64(p, BIGLITTLE(b-alen,b+alen), plen, t0, t0len);
		plen = lbnNorm_64(p, plen + t0len);
		assert(plen <= mlen);
		if (plen > t1len) {
			lbnZero_64(BIGLITTLE(t1-t1len,t1+t1len), plen-t1len);
			t1len = plen;
		}
		cy = lbnAddN_64(t1, p, plen);
		if (cy) {
			if (t1len > plen)
				cy = lbnAdd1_64(BIGLITTLE(t1-plen,t1+plen),
						t1len-plen, cy);
			if (cy) {
				BIGLITTLE(t1[-t1len-1],t1[t1len]) = cy;
				t1len++;
			}
		}
#if BNYIELD
		if (bnYield && (y = bnYield()) < 0)
			goto yield;
#endif
	}
	if (!blen)
		goto failure;	/* gcd(a, mod) != 1 -- FAIL */

	/* The inverse is the negative of t1: return mod - t1 */
	lbnCopy_64(a, mod, mlen);
	assert(t1len <= mlen);
	cy = lbnSubN_64(a, t1, t1len);
	if (cy) {
		assert(mlen > t1len);
		cy = lbnSub1_64(BIGLITTLE(a-t1len, a+t1len), mlen-t1len, cy);
	}
	assert(!cy);

success:
	LBNFREE(t1, mlen);
	LBNFREE(t0, mlen);
	LBNFREE(p, mlen+1);
	LBNFREE(b, mlen+1);
	return 0;

#if BNYIELD
yield:
	LBNFREE(t1, mlen);
	LBNFREE(t0, mlen);
	LBNFREE(p, mlen+1);
	LBNFREE(b, mlen+1);
	return y;
#endif

failure:	/* GCD is not 1 - no inverse exists! */
	LBNFREE(t1, mlen);
	LBNFREE(t0, mlen);
	LBNFREE(p, mlen+1);
	LBNFREE(b, mlen+1);
	return 1;
}
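
#if 0
/*
 * Editorial sketch, not part of the original: a single-word analogue
 * of the algorithm above.  It keeps one cosequence in unsigned
 * magnitudes, using the theorem mentioned in the comment (the signs
 * alternate and the magnitudes stay below the modulus), and fixes the
 * sign at the end.  Returns the inverse, or 0 if gcd(a, mod) != 1.
 */
static BNWORD64
exampleInv1(BNWORD64 a, BNWORD64 mod)
{
	BNWORD64 u = a % mod, v = mod;
	BNWORD64 x = 1, y = 0;	/* Cosequence magnitudes */
	int sign = 1;		/* Sign of x's term */

	while (v) {
		BNWORD64 q = u / v, r = u % v;
		BNWORD64 t = x + q * y;

		u = v; v = r;
		x = y; y = t;
		sign = -sign;
	}
	if (u != 1)
		return 0;	/* No inverse exists */
	/* Now x * a == +/-1 (mod mod); fix the sign */
	return sign > 0 ? x : mod - x;
}
#endif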
/*
 * Precompute powers of "g" mod "mod". Compute them every "bits"
 * for "n" steps. This is sufficient to compute powers of g with
 * exponents up to n*bits bits long, i.e. less than 2^(n*bits).
 *
 * This assumes that the caller has already initialized "array" to point
 * to "n" buffers of size "mlen".
 */
int
lbnBasePrecompBegin_64(BNWORD64 **array, unsigned n, unsigned bits,
	BNWORD64 const *g, unsigned glen, BNWORD64 *mod, unsigned mlen)
{
	BNWORD64 *a, *b;	/* Temporary double-width accumulators */
	BNWORD64 *a1;	/* Pointer to high half of a */
	BNWORD64 inv;	/* Montgomery inverse of LSW of mod */
	BNWORD64 *t;
	unsigned i;

	glen = lbnNorm_64(g, glen);
	assert(glen);

	assert (mlen == lbnNorm_64(mod, mlen));
	assert (glen <= mlen);

	/* Allocate two temporary buffers, and the array slots */
	LBNALLOC(a, BNWORD64, mlen*2);
	if (!a)
		return -1;
	LBNALLOC(b, BNWORD64, mlen*2);
	if (!b) {
		LBNFREE(a, 2*mlen);
		return -1;
	}
	/* Okay, all ready */

	/* Convert g to Montgomery form */
	inv = BIGLITTLE(mod[-1],mod[0]);	/* LSW of modulus */
	assert(inv & 1);	/* Modulus must be odd */
	inv = lbnMontInv1_64(inv);
	/* Move g up "mlen" words into a (clearing the low mlen words) */
	a1 = BIGLITTLE(a-mlen,a+mlen);
	lbnCopy_64(a1, g, glen);
	lbnZero_64(a, mlen);

	/* Do the division - dump the quotient into the high-order words */
	(void)lbnDiv_64(a1, a, mlen+glen, mod, mlen);

	/* Copy the first value into the array */
	t = *array++;
	lbnCopy_64(t, a, mlen);
	a1 = a;	/* This first value is *not* shifted up */

	/* Now compute the remaining n-1 array entries */
	while (--n) {
		i = bits;
		do {
			/* Square a1 into b1 */
			lbnMontSquare_64(b, a1, mod, mlen, inv);
			t = b; b = a; a = t;
			a1 = BIGLITTLE(a-mlen, a+mlen);
		} while (--i);
		t = *array++;
		lbnCopy_64(t, a1, mlen);
	}

	/* Hooray, we're done. */
	LBNFREE(b, 2*mlen);
	LBNFREE(a, 2*mlen);
	return 0;
}
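
/*
 * Editorial example, not in the original: after a successful call
 *	lbnBasePrecompBegin_64(array, n, 4, g, glen, mod, mlen);
 * array[0] holds g, array[1] holds g^(2^4), array[2] holds g^(2^8),
 * and in general array[i] holds g^(2^(4*i)), all in Montgomery form -
 * enough to handle exponents up to n*4 bits.
 */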
/*
 * result = base^exp (mod mod). "array" is an array of pointers
 * to precomputed powers of base, each 2^bits apart. (I.e. array[i]
 * is base^(2^(i*bits))).
 *
 * The algorithm consists of:
 * a = b = (powers of g to be raised to the power 2^bits-1)
 * a *= b *= (powers of g to be raised to the power 2^bits-2)
 * ...
 * a *= b *= (powers of g to be raised to the power 1)
 *
 * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
 * accumulating the product.
 */
int
lbnBasePrecompExp_64(BNWORD64 *result, BNWORD64 const * const *array,
	unsigned bits, BNWORD64 const *exp, unsigned elen,
	BNWORD64 const *mod, unsigned mlen)
{
	BNWORD64 *a, *b, *c, *t;
	BNWORD64 *a1, *b1;
	int anull, bnull;	/* Null flags: values are implicitly 1 */
	unsigned i, j;		/* Loop counters */
	unsigned mask;		/* Exponent bits to examine */
	BNWORD64 const *eptr;	/* Pointer into exp */
	BNWORD64 buf, curbits, nextword;	/* Bit-buffer variables */
	BNWORD64 inv;		/* Inverse of LSW of modulus */
	unsigned ewords;	/* Words of exponent left */
	int bufbits;		/* Number of valid bits */
	int y = 0;		/* bnYield() result */
	mlen = lbnNorm_64(mod, mlen);
	assert (mlen);

	elen = lbnNorm_64(exp, elen);
	if (!elen) {
		/* base ^ 0 == 1 */
		lbnZero_64(result, mlen);
		BIGLITTLE(result[-1],result[0]) = 1;
		return 0;
	}

	/*
	 * This could be precomputed, but it's so cheap, and it would require
	 * making the precomputation structure word-size dependent.
	 */
	inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */

	/*
	 * Allocate three temporary buffers. The current numbers generally
	 * live in the upper halves of these buffers.
	 */
	LBNALLOC(a, BNWORD64, mlen*2);
	if (a) {
		LBNALLOC(b, BNWORD64, mlen*2);
		if (b) {
			LBNALLOC(c, BNWORD64, mlen*2);
			if (c)
				goto allocated;
			LBNFREE(b, 2*mlen);
		}
		LBNFREE(a, 2*mlen);
	}
	return -1;

allocated:
	anull = bnull = 1;

	mask = (1u<<bits) - 1;
	for (i = mask; i; --i) {
		/* Set up bit buffer for walking the exponent */
		eptr = exp;
		buf = BIGLITTLE(*--eptr, *eptr++);
		ewords = elen-1;
		bufbits = 64;
		for (j = 0; ewords || buf; j++) {
			/* Shift down current buffer */
			curbits = buf;
			buf >>= bits;
			bufbits -= bits;
			/* If necessary, add next word */
			if (bufbits < 0 && ewords > 0) {
				nextword = BIGLITTLE(*--eptr, *eptr++);
				ewords--;
				curbits |= nextword << (bufbits+bits);
				buf = nextword >> -bufbits;
				bufbits += 64;
			}
			/* If appropriate, multiply b *= array[j] */
			if ((curbits & mask) == i) {
				BNWORD64 const *d = array[j];

				b1 = BIGLITTLE(b-mlen-1,b+mlen);
				if (bnull) {
					lbnCopy_64(b1, d, mlen);
					bnull = 0;
				} else {
					lbnMontMul_64(c, b1, d, mod, mlen, inv);
					t = c; c = b; b = t;
				}
#if BNYIELD
				if (bnYield && (y = bnYield()) < 0)
					goto yield;
#endif
			}
		}
		/* Multiply a *= b */
		if (!bnull) {
			a1 = BIGLITTLE(a-mlen-1,a+mlen);
			b1 = BIGLITTLE(b-mlen-1,b+mlen);
			if (anull) {
				lbnCopy_64(a1, b1, mlen);
				anull = 0;
			} else {
				lbnMontMul_64(c, a1, b1, mod, mlen, inv);
				t = c; c = a; a = t;
			}
		}
	}

	assert(!anull);	/* If it were, elen would have been 0 */

	/* Convert out of Montgomery form and return */
	a1 = BIGLITTLE(a-mlen-1,a+mlen);
	lbnCopy_64(a, a1, mlen);
	lbnZero_64(a1, mlen);
	lbnMontReduce_64(a, mod, mlen, inv);
	lbnCopy_64(result, a1, mlen);

#if BNYIELD
yield:
#endif
	LBNFREE(c, 2*mlen);
	LBNFREE(b, 2*mlen);
	LBNFREE(a, 2*mlen);

	return y;
}
/*
 * result = base1^exp1 * base2^exp2 (mod mod). "array1" and "array2" are
 * arrays of pointers to precomputed powers of the corresponding bases,
 * each 2^bits apart. (I.e. array1[i] is base1^(2^(i*bits))).
 *
 * Bits must be the same in both. (It could be made adjustable, but it's
 * a bit of a pain. Just make them both equal to the larger one.)
 *
 * The algorithm consists of:
 * a = b = (powers of base1 and base2 to be raised to the power 2^bits-1)
 * a *= b *= (powers of base1 and base2 to be raised to the power 2^bits-2)
 * ...
 * a *= b *= (powers of base1 and base2 to be raised to the power 1)
 *
 * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
 * accumulating the product.
 */
int
lbnDoubleBasePrecompExp_64(BNWORD64 *result, unsigned bits,
	BNWORD64 const * const *array1, BNWORD64 const *exp1, unsigned elen1,
	BNWORD64 const * const *array2, BNWORD64 const *exp2,
	unsigned elen2, BNWORD64 const *mod, unsigned mlen)
{
	BNWORD64 *a, *b, *c, *t;
	BNWORD64 *a1, *b1;
	int anull, bnull;	/* Null flags: values are implicitly 1 */
	unsigned i, j, k;	/* Loop counters */
	unsigned mask;		/* Exponent bits to examine */
	BNWORD64 const *eptr;	/* Pointer into exp */
	BNWORD64 buf, curbits, nextword;	/* Bit-buffer variables */
	BNWORD64 inv;		/* Inverse of LSW of modulus */
	unsigned ewords;	/* Words of exponent left */
	int bufbits;		/* Number of valid bits */
	int y = 0;		/* bnYield() result */
	BNWORD64 const * const *array;	/* The array currently being walked */
	mlen = lbnNorm_64(mod, mlen);
	assert (mlen);

	elen1 = lbnNorm_64(exp1, elen1);
	if (!elen1)
		return lbnBasePrecompExp_64(result, array2, bits, exp2, elen2,
		                            mod, mlen);
	elen2 = lbnNorm_64(exp2, elen2);
	if (!elen2)
		return lbnBasePrecompExp_64(result, array1, bits, exp1, elen1,
		                            mod, mlen);

	/*
	 * This could be precomputed, but it's so cheap, and it would require
	 * making the precomputation structure word-size dependent.
	 */
	inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]);	/* LSW of modulus */
	/*
	 * Allocate three temporary buffers. The current numbers generally
	 * live in the upper halves of these buffers.
	 */
	LBNALLOC(a, BNWORD64, mlen*2);
	if (a) {
		LBNALLOC(b, BNWORD64, mlen*2);
		if (b) {
			LBNALLOC(c, BNWORD64, mlen*2);
			if (c)
				goto allocated;
			LBNFREE(b, 2*mlen);
		}
		LBNFREE(a, 2*mlen);
	}
	return -1;

allocated:
	anull = bnull = 1;

	mask = (1u<<bits) - 1;
	for (i = mask; i; --i) {
		/* Walk each exponent in turn */
		for (k = 0; k < 2; k++) {
			/* Set up the exponent for walking */
			array = k ? array2 : array1;
			eptr = k ? exp2 : exp1;
			ewords = (k ? elen2 : elen1) - 1;
			/* Set up bit buffer for walking the exponent */
			buf = BIGLITTLE(*--eptr, *eptr++);
			bufbits = 64;
			for (j = 0; ewords || buf; j++) {
				/* Shift down current buffer */
				curbits = buf;
				buf >>= bits;
				bufbits -= bits;
				/* If necessary, add next word */
				if (bufbits < 0 && ewords > 0) {
					nextword = BIGLITTLE(*--eptr, *eptr++);
					ewords--;
					curbits |= nextword << (bufbits+bits);
					buf = nextword >> -bufbits;
					bufbits += 64;
				}
				/* If appropriate, multiply b *= array[j] */
				if ((curbits & mask) == i) {
					BNWORD64 const *d = array[j];

					b1 = BIGLITTLE(b-mlen-1,b+mlen);
					if (bnull) {
						lbnCopy_64(b1, d, mlen);
						bnull = 0;
					} else {
						lbnMontMul_64(c, b1, d, mod, mlen, inv);
						t = c; c = b; b = t;
					}
#if BNYIELD
					if (bnYield && (y = bnYield()) < 0)
						goto yield;
#endif
				}
			}
		}
		/* Multiply a *= b */
		if (!bnull) {
			a1 = BIGLITTLE(a-mlen-1,a+mlen);
			b1 = BIGLITTLE(b-mlen-1,b+mlen);
			if (anull) {
				lbnCopy_64(a1, b1, mlen);
				anull = 0;
			} else {
				lbnMontMul_64(c, a1, b1, mod, mlen, inv);
				t = c; c = a; a = t;
			}
		}
	}

	assert(!anull);	/* If it were, elen would have been 0 */

	/* Convert out of Montgomery form and return */
	a1 = BIGLITTLE(a-mlen-1,a+mlen);
	lbnCopy_64(a, a1, mlen);
	lbnZero_64(a1, mlen);
	lbnMontReduce_64(a, mod, mlen, inv);
	lbnCopy_64(result, a1, mlen);

#if BNYIELD
yield:
#endif
	LBNFREE(c, 2*mlen);
	LBNFREE(b, 2*mlen);
	LBNFREE(a, 2*mlen);

	return y;
}