]> git.ipfire.org Git - thirdparty/freeswitch.git/blob - libs/libzrtp/third_party/bnlib/lbn64.c
[mod_http_cache] Fix leaking curl handle in http_get()
[thirdparty/freeswitch.git] / libs / libzrtp / third_party / bnlib / lbn64.c
1 /*
2 * Copyright (c) 1995 Colin Plumb. All rights reserved.
3 * For licensing and other legal details, see the file legal.c.
4 *
5 * lbn64.c - Low-level bignum routines, 64-bit version.
6 *
7 * NOTE: the magic constants "64" and "128" appear in many places in this
8 * file, including inside identifiers. Because it is not possible to
9 * ask "#ifdef" of a macro expansion, it is not possible to use the
10 * preprocessor to conditionalize these properly. Thus, this file is
11 * intended to be edited with textual search and replace to produce
12 * alternate word size versions. Any reference to the number of bits
13 * in a word must be the string "64", and that string must not appear
14 * otherwise. Any reference to twice this number must appear as "128",
15 * which likewise must not appear otherwise. Is that clear?
16 *
17 * Remember, when doubling the bit size replace the larger number (128)
18 * first, then the smaller (64). When halving the bit size, do the
 * opposite. Otherwise, things will get weird. Also, be sure to replace
20 * every instance that appears. (:%s/foo/bar/g in vi)
21 *
22 * These routines work with a pointer to the least-significant end of
23 * an array of WORD64s. The BIG(x), LITTLE(y) and BIGLTTLE(x,y) macros
 * defined in lbn.h (which expand to x on a big-endian machine and y on a
25 * little-endian machine) are used to conditionalize the code to work
26 * either way. If you have no assembly primitives, it doesn't matter.
27 * Note that on a big-endian machine, the least-significant-end pointer
28 * is ONE PAST THE END. The bytes are ptr[-1] through ptr[-len].
29 * On little-endian, they are ptr[0] through ptr[len-1]. This makes
30 * perfect sense if you consider pointers to point *between* bytes rather
31 * than at them.
32 *
33 * Because the array index values are unsigned integers, ptr[-i]
34 * may not work properly, since the index -i is evaluated as an unsigned,
35 * and if pointers are wider, zero-extension will produce a positive
 * number rather than the needed negative. The expression used in this
37 * code, *(ptr-i) will, however, work. (The array syntax is equivalent
38 * to *(ptr+-i), which is a pretty subtle difference.)
39 *
40 * Many of these routines will get very unhappy if fed zero-length inputs.
 * They use assert() to enforce this. A higher layer of code must make
42 * sure that these aren't called with zero-length inputs.
43 *
44 * Any of these routines can be replaced with more efficient versions
45 * elsewhere, by just #defining their names. If one of the names
46 * is #defined, the C code is not compiled in and no declaration is
47 * made. Use the BNINCLUDE file to do that. Typically, you compile
48 * asm subroutines with the same name and just, e.g.
49 * #define lbnMulAdd1_64 lbnMulAdd1_64
50 *
51 * If you want to write asm routines, start with lbnMulAdd1_64().
52 * This is the workhorse of modular exponentiation. lbnMulN1_64() is
53 * also used a fair bit, although not as much and it's defined in terms
54 * of lbnMulAdd1_64 if that has a custom version. lbnMulSub1_64 and
55 * lbnDiv21_64 are used in the usual division and remainder finding.
56 * (Not the Montgomery reduction used in modular exponentiation, though.)
57 * Once you have lbnMulAdd1_64 defined, writing the other two should
58 * be pretty easy. (Just make sure you get the sign of the subtraction
59 * in lbnMulSub1_64 right - it's dest = dest - source * k.)
60 *
61 * The only definitions that absolutely need a double-word (BNWORD128)
62 * type are lbnMulAdd1_64 and lbnMulSub1_64; if those are provided,
63 * the rest follows. lbnDiv21_64, however, is a lot slower unless you
64 * have them, and lbnModQ_64 takes after it. That one is used quite a
65 * bit for prime sieving.
66 */
67
68 #ifndef HAVE_CONFIG_H
69 #define HAVE_CONFIG_H 0
70 #endif
71 #if HAVE_CONFIG_H
72 #include "bnconfig.h"
73 #endif
74
75 /*
76 * Some compilers complain about #if FOO if FOO isn't defined,
77 * so do the ANSI-mandated thing explicitly...
78 */
79 #ifndef NO_ASSERT_H
80 #define NO_ASSERT_H 0
81 #endif
82 #ifndef NO_STRING_H
83 #define NO_STRING_H 0
84 #endif
85 #ifndef HAVE_STRINGS_H
86 #define HAVE_STRINGS_H 0
87 #endif
88
89 #if !NO_ASSERT_H
90 #include <assert.h>
91 #else
92 #define assert(x) (void)0
93 #endif
94
95 #if !NO_STRING_H
96 #include <string.h> /* For memcpy */
97 #elif HAVE_STRINGS_H
98 #include <strings.h>
99 #endif
100
101 #include "lbn.h"
102 #include "lbn64.h"
103 #include "lbnmem.h"
104
105 #include "kludge.h"
106
107 #ifndef BNWORD64
108 #error 64-bit bignum library requires a 64-bit data type
109 #endif
110
111 /* If this is defined, include bnYield() calls */
112 #if BNYIELD
113 extern int (*bnYield)(void); /* From bn.c */
114 #endif
115
116 /*
117 * Most of the multiply (and Montgomery reduce) routines use an outer
118 * loop that iterates over one of the operands - a so-called operand
119 * scanning approach. One big advantage of this is that the assembly
120 * support routines are simpler. The loops can be rearranged to have
121 * an outer loop that iterates over the product, a so-called product
122 * scanning approach. This has the advantage of writing less data
123 * and doing fewer adds to memory, so is supposedly faster. Some
124 * code has been written using a product-scanning approach, but
125 * it appears to be slower, so it is turned off by default. Some
126 * experimentation would be appreciated.
127 *
128 * (The code is also annoying to get right and not very well commented,
129 * one of my pet peeves about math libraries. I'm sorry.)
130 */
131 #ifndef PRODUCT_SCAN
132 #define PRODUCT_SCAN 0
133 #endif
134
135 /*
136 * Copy an array of words. <Marvin mode on> Thrilling, isn't it? </Marvin>
137 * This is a good example of how the byte offsets and BIGLITTLE() macros work.
138 * Another alternative would have been
139 * memcpy(dest BIG(-len), src BIG(-len), len*sizeof(BNWORD64)), but I find that
140 * putting operators into conditional macros is confusing.
141 */
142 #ifndef lbnCopy_64
143 void
144 lbnCopy_64(BNWORD64 *dest, BNWORD64 const *src, unsigned len)
145 {
146 memcpy(BIGLITTLE(dest-len,dest), BIGLITTLE(src-len,src),
147 len * sizeof(*src));
148 }
149 #endif /* !lbnCopy_64 */
150
151 /*
152 * Fill n words with zero. This does it manually rather than calling
153 * memset because it can assume alignment to make things faster while
154 * memset can't. Note how big-endian numbers are naturally addressed
155 * using predecrement, while little-endian is postincrement.
156 */
157 #ifndef lbnZero_64
158 void
159 lbnZero_64(BNWORD64 *num, unsigned len)
160 {
161 while (len--)
162 BIGLITTLE(*--num,*num++) = 0;
163 }
164 #endif /* !lbnZero_64 */
165
166 /*
167 * Negate an array of words.
168 * Negation is subtraction from zero. Negating low-order words
169 * entails doing nothing until a non-zero word is hit. Once that
170 * is negated, a borrow is generated and never dies until the end
171 * of the number is hit. Negation with borrow, -x-1, is the same as ~x.
172 * Repeat that until the end of the number.
173 *
174 * Doesn't return borrow out because that's pretty useless - it's
175 * always set unless the input is 0, which is easy to notice in
176 * normalized form.
177 */
#ifndef lbnNeg_64
/*
 * lbnNeg_64: two's-complement negate a len-word bignum in place.
 * See the block comment above: skip low-order zero words, negate the
 * first non-zero word, then complement everything above it (negate
 * with borrow, -x-1, is the same as ~x).
 */
void
lbnNeg_64(BNWORD64 *num, unsigned len)
{
	assert(len);

	/* Skip low-order zero words */
	while (BIGLITTLE(*--num,*num) == 0) {
		if (!--len)
			return;	/* Input was all zero; -0 == 0, nothing to do */
		LITTLE(num++;)
	}
	/* Negate the lowest-order non-zero word */
	*num = -*num;
	/* Complement all the higher-order words */
	while (--len) {
		BIGLITTLE(--num,++num);
		*num = ~*num;
	}
}
#endif /* !lbnNeg_64 */
199
200
201 /*
202 * lbnAdd1_64: add the single-word "carry" to the given number.
203 * Used for minor increments and propagating the carry after
204 * adding in a shorter bignum.
205 *
206 * Technique: If we have a double-width word, presumably the compiler
207 * can add using its carry in inline code, so we just use a larger
208 * accumulator to compute the carry from the first addition.
209 * If not, it's more complex. After adding the first carry, which may
210 * be > 1, compare the sum and the carry. If the sum wraps (causing a
211 * carry out from the addition), the result will be less than each of the
212 * inputs, since the wrap subtracts a number (2^64) which is larger than
213 * the other input can possibly be. If the sum is >= the carry input,
214 * return success immediately.
215 * In either case, if there is a carry, enter a loop incrementing words
216 * until one does not wrap. Since we are adding 1 each time, the wrap
217 * will be to 0 and we can test for equality.
218 */
#ifndef lbnAdd1_64 /* If defined, it's provided as an asm subroutine */
#ifdef BNWORD128
BNWORD64
lbnAdd1_64(BNWORD64 *num, unsigned len, BNWORD64 carry)
{
	BNWORD128 t;
	assert(len > 0);	/* Alternative: if (!len) return carry */

	/* Double-width add; the high half of t is the carry out. */
	t = (BNWORD128)BIGLITTLE(*--num,*num) + carry;
	BIGLITTLE(*num,*num++) = (BNWORD64)t;
	if ((t >> 64) == 0)
		return 0;
	/* Propagate the unit carry until a word does not wrap to 0. */
	while (--len) {
		if (++BIGLITTLE(*--num,*num++) != 0)
			return 0;
	}
	return 1;	/* Carry out of the most significant word */
}
#else /* no BNWORD128 */
BNWORD64
lbnAdd1_64(BNWORD64 *num, unsigned len, BNWORD64 carry)
{
	assert(len > 0);	/* Alternative: if (!len) return carry */

	/* If the sum is >= the carry added in, the add did not wrap. */
	if ((BIGLITTLE(*--num,*num++) += carry) >= carry)
		return 0;
	/* Propagate the unit carry until a word does not wrap to 0. */
	while (--len) {
		if (++BIGLITTLE(*--num,*num++) != 0)
			return 0;
	}
	return 1;	/* Carry out of the most significant word */
}
#endif
#endif/* !lbnAdd1_64 */
253
254 /*
255 * lbnSub1_64: subtract the single-word "borrow" from the given number.
256 * Used for minor decrements and propagating the borrow after
257 * subtracting a shorter bignum.
258 *
259 * Technique: Similar to the add, above. If there is a double-length type,
260 * use that to generate the first borrow.
261 * If not, after subtracting the first borrow, which may be > 1, compare
262 * the difference and the *negative* of the carry. If the subtract wraps
263 * (causing a borrow out from the subtraction), the result will be at least
264 * as large as -borrow. If the result < -borrow, then no borrow out has
265 * appeared and we may return immediately, except when borrow == 0. To
266 * deal with that case, use the identity that -x = ~x+1, and instead of
267 * comparing < -borrow, compare for <= ~borrow.
268 * Either way, if there is a borrow out, enter a loop decrementing words
269 * until a non-zero word is reached.
270 *
271 * Note the cast of ~borrow to (BNWORD64). If the size of an int is larger
272 * than BNWORD64, C rules say the number is expanded for the arithmetic, so
273 * the inversion will be done on an int and the value won't be quite what
274 * is expected.
275 */
#ifndef lbnSub1_64	/* If defined, it's provided as an asm subroutine */
#ifdef BNWORD128
BNWORD64
lbnSub1_64(BNWORD64 *num, unsigned len, BNWORD64 borrow)
{
	BNWORD128 t;
	assert(len > 0);	/* Alternative: if (!len) return borrow */

	/* Double-width subtract; a non-zero high half means borrow out. */
	t = (BNWORD128)BIGLITTLE(*--num,*num) - borrow;
	BIGLITTLE(*num,*num++) = (BNWORD64)t;
	if ((t >> 64) == 0)
		return 0;
	/* Propagate the unit borrow: a word that was non-zero absorbs it. */
	while (--len) {
		if ((BIGLITTLE(*--num,*num++))-- != 0)
			return 0;
	}
	return 1;	/* Borrow out of the most significant word */
}
#else /* no BNWORD128 */
BNWORD64
lbnSub1_64(BNWORD64 *num, unsigned len, BNWORD64 borrow)
{
	assert(len > 0);	/* Alternative: if (!len) return borrow */

	/* <= ~borrow means no wrap; see the -x == ~x+1 note above. */
	if ((BIGLITTLE(*--num,*num++) -= borrow) <= (BNWORD64)~borrow)
		return 0;
	/* Propagate the unit borrow: a word that was non-zero absorbs it. */
	while (--len) {
		if ((BIGLITTLE(*--num,*num++))-- != 0)
			return 0;
	}
	return 1;	/* Borrow out of the most significant word */
}
#endif
#endif /* !lbnSub1_64 */
310
311 /*
312 * lbnAddN_64: add two bignums of the same length, returning the carry (0 or 1).
313 * One of the building blocks, along with lbnAdd1, of adding two bignums of
314 * differing lengths.
315 *
316 * Technique: Maintain a word of carry. If there is no double-width type,
317 * use the same technique as in lbnAdd1, above, to maintain the carry by
318 * comparing the inputs. Adding the carry sources is used as an OR operator;
319 * at most one of the two comparisons can possibly be true. The first can
320 * only be true if carry == 1 and x, the result, is 0. In that case the
321 * second can't possibly be true.
322 */
#ifndef lbnAddN_64
#ifdef BNWORD128
BNWORD64
lbnAddN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
{
	BNWORD128 t;

	assert(len > 0);

	/* First word: no carry in. */
	t = (BNWORD128)BIGLITTLE(*--num1,*num1) + BIGLITTLE(*--num2,*num2++);
	BIGLITTLE(*num1,*num1++) = (BNWORD64)t;
	while (--len) {
		/* t >> 64 is the carry (0 or 1) out of the previous word. */
		t = (BNWORD128)BIGLITTLE(*--num1,*num1) +
		    (BNWORD128)BIGLITTLE(*--num2,*num2++) + (t >> 64);
		BIGLITTLE(*num1,*num1++) = (BNWORD64)t;
	}

	return (BNWORD64)(t>>64);	/* Final carry out */
}
#else /* no BNWORD128 */
BNWORD64
lbnAddN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
{
	BNWORD64 x, carry = 0;

	assert(len > 0);	/* Alternative: change loop to test at start */

	do {
		x = BIGLITTLE(*--num2,*num2++);
		/* Fold the carry into x; at most one comparison can be true */
		carry = (x += carry) < carry;
		carry += (BIGLITTLE(*--num1,*num1++) += x) < x;
	} while (--len);

	return carry;
}
#endif
#endif /* !lbnAddN_64 */
360
361 /*
 * lbnSubN_64: subtract two bignums of the same length, returning the borrow
 * (0 or 1).
363 * One of the building blocks, along with subn1, of subtracting two bignums of
364 * differing lengths.
365 *
 * Technique: If no double-width type is available, maintain a word of borrow.
367 * First, add the borrow to the subtrahend (did you have to learn all those
368 * awful words in elementary school, too?), and if it overflows, set the
369 * borrow again. Then subtract the modified subtrahend from the next word
370 * of input, using the same technique as in subn1, above.
371 * Adding the borrows is used as an OR operator; at most one of the two
372 * comparisons can possibly be true. The first can only be true if
373 * borrow == 1 and x, the result, is 0. In that case the second can't
374 * possibly be true.
375 *
376 * In the double-word case, (BNWORD64)-(t>>64) is subtracted, rather than
377 * adding t>>64, because the shift would need to sign-extend and that's
378 * not guaranteed to happen in ANSI C, even with signed types.
379 */
#ifndef lbnSubN_64
#ifdef BNWORD128
BNWORD64
lbnSubN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
{
	BNWORD128 t;

	assert(len > 0);

	/* First word: no borrow in. */
	t = (BNWORD128)BIGLITTLE(*--num1,*num1) - BIGLITTLE(*--num2,*num2++);
	BIGLITTLE(*num1,*num1++) = (BNWORD64)t;

	while (--len) {
		/*
		 * Subtract (BNWORD64)-(t>>64) rather than adding t>>64
		 * because the shift is not guaranteed to sign-extend
		 * (see the block comment above).
		 */
		t = (BNWORD128)BIGLITTLE(*--num1,*num1) -
		    (BNWORD128)BIGLITTLE(*--num2,*num2++) - (BNWORD64)-(t >> 64);
		BIGLITTLE(*num1,*num1++) = (BNWORD64)t;
	}

	return -(BNWORD64)(t>>64);	/* Final borrow out, as 0 or 1 */
}
#else
BNWORD64
lbnSubN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
{
	BNWORD64 x, borrow = 0;

	assert(len > 0);	/* Alternative: change loop to test at start */

	do {
		x = BIGLITTLE(*--num2,*num2++);
		/* Fold the borrow into x; at most one comparison can be true */
		borrow = (x += borrow) < borrow;
		borrow += (BIGLITTLE(*--num1,*num1++) -= x) > (BNWORD64)~x;
	} while (--len);

	return borrow;
}
#endif
#endif /* !lbnSubN_64 */
418
#ifndef lbnCmp_64
/*
 * lbnCmp_64: compare two bignums of equal length, returning the sign of
 * num1 - num2. (-1, 0 or +1).
 *
 * Technique: Change the little-endian pointers to big-endian pointers
 * and compare from the most-significant end until a difference is found.
 * When it is, figure out the sign of the difference and return it.
 */
int
lbnCmp_64(BNWORD64 const *num1, BNWORD64 const *num2, unsigned len)
{
	/* Move both pointers to the most-significant end. */
	BIGLITTLE(num1 -= len, num1 += len);
	BIGLITTLE(num2 -= len, num2 += len);

	while (len--) {
		if (BIGLITTLE(*num1++ != *num2++, *--num1 != *--num2)) {
			/* Big-endian has already stepped past the word. */
			if (BIGLITTLE(num1[-1] < num2[-1], *num1 < *num2))
				return -1;
			else
				return 1;
		}
	}
	return 0;	/* All words equal */
}
#endif /* !lbnCmp_64 */
445
446 /*
447 * mul64_ppmmaa(ph,pl,x,y,a,b) is an optional routine that
448 * computes (ph,pl) = x * y + a + b. mul64_ppmma and mul64_ppmm
449 * are simpler versions. If you want to be lazy, all of these
450 * can be defined in terms of the others, so here we create any
451 * that have not been defined in terms of the ones that have been.
452 */
453
/* Define ones with fewer a's in terms of ones with more a's */
#if !defined(mul64_ppmma) && defined(mul64_ppmmaa)
#define mul64_ppmma(ph,pl,x,y,a) mul64_ppmmaa(ph,pl,x,y,a,0)
#endif

#if !defined(mul64_ppmm) && defined(mul64_ppmma)
#define mul64_ppmm(ph,pl,x,y) mul64_ppmma(ph,pl,x,y,0)
#endif

/*
 * Use this definition to test the mul64_ppmm-based operations on machines
 * that do not provide mul64_ppmm. Change the final "0" to a "1" to
 * enable it.
 */
#if !defined(mul64_ppmm) && defined(BNWORD128) && 0	/* Debugging */
#define mul64_ppmm(ph,pl,x,y) \
	({BNWORD128 _ = (BNWORD128)(x)*(y); (pl) = _; (ph) = _>>64;})
#endif

/*
 * Define ones with more a's in terms of ones with fewer: add the extra
 * addend into the low word, and bump the high word when the low-word
 * add wraps ((pl) += (a) yielding a result < (a) means carry out).
 */
#if defined(mul64_ppmm) && !defined(mul64_ppmma)
#define mul64_ppmma(ph,pl,x,y,a) \
	(mul64_ppmm(ph,pl,x,y), (ph) += ((pl) += (a)) < (a))
#endif

#if defined(mul64_ppmma) && !defined(mul64_ppmmaa)
#define mul64_ppmmaa(ph,pl,x,y,a,b) \
	(mul64_ppmma(ph,pl,x,y,a), (ph) += ((pl) += (b)) < (b))
#endif
482
483 /*
484 * lbnMulN1_64: Multiply an n-word input by a 1-word input and store the
485 * n+1-word product. This uses either the mul64_ppmm and mul64_ppmma
486 * macros, or C multiplication with the BNWORD128 type. This uses mul64_ppmma
487 * if available, assuming you won't bother defining it unless you can do
488 * better than the normal multiplication.
489 */
#ifndef lbnMulN1_64
#ifdef lbnMulAdd1_64	/* If we have this asm primitive, use it. */
void
lbnMulN1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
{
	/*
	 * Zero the destination and multiply-accumulate into it; the
	 * returned high word becomes the (len+1)st word of the product.
	 */
	lbnZero_64(out, len);
	BIGLITTLE(*(out-len-1),*(out+len)) = lbnMulAdd1_64(out, in, len, k);
}
#elif defined(mul64_ppmm)
void
lbnMulN1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
{
	BNWORD64 carry, carryin;

	assert(len > 0);

	/* First word: plain multiply, no carry in. */
	BIG(--out;--in;);
	mul64_ppmm(carry, *out, *in, k);
	LITTLE(out++;in++;)

	while (--len) {
		BIG(--out;--in;)
		carryin = carry;
		/* (carry, *out) = *in * k + carryin */
		mul64_ppmma(carry, *out, *in, k, carryin);
		LITTLE(out++;in++;)
	}
	/* Store the final high word as the (len+1)st output word. */
	BIGLITTLE(*--out,*out) = carry;
}
#elif defined(BNWORD128)
void
lbnMulN1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
{
	BNWORD128 p;

	assert(len > 0);

	p = (BNWORD128)BIGLITTLE(*--in,*in++) * k;
	BIGLITTLE(*--out,*out++) = (BNWORD64)p;

	while (--len) {
		/* p >> 64 is the carry word from the previous product. */
		p = (BNWORD128)BIGLITTLE(*--in,*in++) * k + (BNWORD64)(p >> 64);
		BIGLITTLE(*--out,*out++) = (BNWORD64)p;
	}
	/* Store the final high word as the (len+1)st output word. */
	BIGLITTLE(*--out,*out) = (BNWORD64)(p >> 64);
}
#else
#error No 64x64 -> 128 multiply available for 64-bit bignum package
#endif
#endif /* lbnMulN1_64 */
539
540 /*
541 * lbnMulAdd1_64: Multiply an n-word input by a 1-word input and add the
542 * low n words of the product to the destination. *Returns the n+1st word
543 * of the product.* (That turns out to be more convenient than adding
544 * it into the destination and dealing with a possible unit carry out
545 * of *that*.) This uses either the mul64_ppmma and mul64_ppmmaa macros,
546 * or C multiplication with the BNWORD128 type.
547 *
548 * If you're going to write assembly primitives, this is the one to
549 * start with. It is by far the most commonly called function.
550 */
#ifndef lbnMulAdd1_64
#if defined(mul64_ppmm)
BNWORD64
lbnMulAdd1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
{
	BNWORD64 prod, carry, carryin;

	assert(len > 0);

	/* First word: the existing *out is the addend, no carry in. */
	BIG(--out;--in;);
	carryin = *out;
	mul64_ppmma(carry, *out, *in, k, carryin);
	LITTLE(out++;in++;)

	while (--len) {
		BIG(--out;--in;);
		carryin = carry;
		/* (carry, prod) = *in * k + carryin + *out */
		mul64_ppmmaa(carry, prod, *in, k, carryin, *out);
		*out = prod;
		LITTLE(out++;in++;)
	}

	return carry;	/* The (len+1)st word of the product */
}
#elif defined(BNWORD128)
BNWORD64
lbnMulAdd1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
{
	BNWORD128 p;

	assert(len > 0);

	/* First word: existing *out is the addend, no carry in. */
	p = (BNWORD128)BIGLITTLE(*--in,*in++) * k + BIGLITTLE(*--out,*out);
	BIGLITTLE(*out,*out++) = (BNWORD64)p;

	while (--len) {
		/* p >> 64 is the carry word from the previous step. */
		p = (BNWORD128)BIGLITTLE(*--in,*in++) * k +
		    (BNWORD64)(p >> 64) + BIGLITTLE(*--out,*out);
		BIGLITTLE(*out,*out++) = (BNWORD64)p;
	}

	return (BNWORD64)(p >> 64);	/* The (len+1)st word of the product */
}
#else
#error No 64x64 -> 128 multiply available for 64-bit bignum package
#endif
#endif /* lbnMulAdd1_64 */
598
599 /*
600 * lbnMulSub1_64: Multiply an n-word input by a 1-word input and subtract the
601 * n-word product from the destination. Returns the n+1st word of the product.
602 * This uses either the mul64_ppmm and mul64_ppmma macros, or
603 * C multiplication with the BNWORD128 type.
604 *
605 * This is rather uglier than adding, but fortunately it's only used in
606 * division which is not used too heavily.
607 */
#ifndef lbnMulSub1_64
#if defined(mul64_ppmm)
BNWORD64
lbnMulSub1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
{
	BNWORD64 prod, carry, carryin;

	assert(len > 0);

	BIG(--in;)
	mul64_ppmm(carry, prod, *in, k);
	LITTLE(in++;)
	/* Subtract prod from *out; > ~prod detects the borrow out. */
	carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD64)~prod;

	while (--len) {
		BIG(--in;);
		carryin = carry;
		/* (carry, prod) = *in * k + carryin */
		mul64_ppmma(carry, prod, *in, k, carryin);
		LITTLE(in++;)
		carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD64)~prod;
	}

	return carry;	/* The (len+1)st word of the product */
}
#elif defined(BNWORD128)
BNWORD64
lbnMulSub1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
{
	BNWORD128 p;
	BNWORD64 carry, t;

	assert(len > 0);

	p = (BNWORD128)BIGLITTLE(*--in,*in++) * k;
	t = BIGLITTLE(*--out,*out);
	/* Result > original t means the subtract wrapped: add a borrow. */
	carry = (BNWORD64)(p>>64) + ((BIGLITTLE(*out,*out++)=t-(BNWORD64)p) > t);

	while (--len) {
		p = (BNWORD128)BIGLITTLE(*--in,*in++) * k + carry;
		t = BIGLITTLE(*--out,*out);
		carry = (BNWORD64)(p>>64) +
			( (BIGLITTLE(*out,*out++)=t-(BNWORD64)p) > t );
	}

	return carry;	/* The (len+1)st word of the product */
}
#else
#error No 64x64 -> 128 multiply available for 64-bit bignum package
#endif
#endif /* !lbnMulSub1_64 */
658
659 /*
660 * Shift n words left "shift" bits. 0 < shift < 64. Returns the
661 * carry, any bits shifted off the left-hand side (0 <= carry < 2^shift).
662 */
663 #ifndef lbnLshift_64
664 BNWORD64
665 lbnLshift_64(BNWORD64 *num, unsigned len, unsigned shift)
666 {
667 BNWORD64 x, carry;
668
669 assert(shift > 0);
670 assert(shift < 64);
671
672 carry = 0;
673 while (len--) {
674 BIG(--num;)
675 x = *num;
676 *num = (x<<shift) | carry;
677 LITTLE(num++;)
678 carry = x >> (64-shift);
679 }
680 return carry;
681 }
682 #endif /* !lbnLshift_64 */
683
684 /*
685 * An optimized version of the above, for shifts of 1.
686 * Some machines can use add-with-carry tricks for this.
687 */
688 #ifndef lbnDouble_64
689 BNWORD64
690 lbnDouble_64(BNWORD64 *num, unsigned len)
691 {
692 BNWORD64 x, carry;
693
694 carry = 0;
695 while (len--) {
696 BIG(--num;)
697 x = *num;
698 *num = (x<<1) | carry;
699 LITTLE(num++;)
700 carry = x >> (64-1);
701 }
702 return carry;
703 }
704 #endif /* !lbnDouble_64 */
705
706 /*
707 * Shift n words right "shift" bits. 0 < shift < 64. Returns the
708 * carry, any bits shifted off the right-hand side (0 <= carry < 2^shift).
709 */
710 #ifndef lbnRshift_64
711 BNWORD64
712 lbnRshift_64(BNWORD64 *num, unsigned len, unsigned shift)
713 {
714 BNWORD64 x, carry = 0;
715
716 assert(shift > 0);
717 assert(shift < 64);
718
719 BIGLITTLE(num -= len, num += len);
720
721 while (len--) {
722 LITTLE(--num;)
723 x = *num;
724 *num = (x>>shift) | carry;
725 BIG(num++;)
726 carry = x << (64-shift);
727 }
728 return carry >> (64-shift);
729 }
730 #endif /* !lbnRshift_64 */
731
732 /*
733 * Multiply two numbers of the given lengths. prod and num2 may overlap,
734 * provided that the low len1 bits of prod are free. (This corresponds
735 * nicely to the place the result is returned from lbnMontReduce_64.)
736 *
737 * TODO: Use Karatsuba multiply. The overlap constraints may have
738 * to get rewhacked.
739 */
#ifndef lbnMul_64
/*
 * lbnMul_64: multiply num1 (len1 words) by num2 (len2 words) into prod
 * (len1+len2 words), by operand scanning: one lbnMulN1_64 for the first
 * word of num2, then one lbnMulAdd1_64 per remaining word.
 */
void
lbnMul_64(BNWORD64 *prod, BNWORD64 const *num1, unsigned len1,
	BNWORD64 const *num2, unsigned len2)
{
	/* Special case of zero */
	if (!len1 || !len2) {
		lbnZero_64(prod, len1+len2);
		return;
	}

	/* Multiply first word */
	lbnMulN1_64(prod, num1, len1, BIGLITTLE(*--num2,*num2++));

	/*
	 * Add in subsequent words, storing the most significant word,
	 * which is new each time.
	 */
	while (--len2) {
		BIGLITTLE(--prod,prod++);	/* Next output column */
		BIGLITTLE(*(prod-len1-1),*(prod+len1)) =
			lbnMulAdd1_64(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
	}
}
#endif /* !lbnMul_64 */
765
766 /*
767 * lbnMulX_64 is a square multiply - both inputs are the same length.
768 * It's normally just a macro wrapper around the general multiply,
769 * but might be implementable in assembly more efficiently (such as
770 * when product scanning).
771 */
#ifndef lbnMulX_64
#if defined(BNWORD128) && PRODUCT_SCAN
/*
 * Test code to see whether product scanning is any faster. It seems
 * to make the C code slower, so PRODUCT_SCAN is not defined.
 */
static void
lbnMulX_64(BNWORD64 *prod, BNWORD64 const *num1, BNWORD64 const *num2,
	unsigned len)
{
	BNWORD128 x, y;
	BNWORD64 const *p1, *p2;
	unsigned carry;
	unsigned i, j;

	/* Special case of zero */
	if (!len)
		return;

	/* Word 0 of the product: a single partial product. */
	x = (BNWORD128)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
	BIGLITTLE(*--prod, *prod++) = (BNWORD64)x;
	x >>= 64;

	/* Low-half columns: output word i sums i+1 partial products. */
	for (i = 1; i < len; i++) {
		carry = 0;
		p1 = num1;
		p2 = BIGLITTLE(num2-i-1,num2+i+1);
		for (j = 0; j <= i; j++) {
			BIG(y = (BNWORD128)*--p1 * *p2++;)
			LITTLE(y = (BNWORD128)*p1++ * *--p2;)
			x += y;
			carry += (x < y);	/* Overflow out of the 128-bit sum */
		}
		BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
		x = (x >> 64) | (BNWORD128)carry << 64;
	}
	/* High-half columns: output words len .. 2*len-2. */
	for (i = 1; i < len; i++) {
		carry = 0;
		p1 = BIGLITTLE(num1-i,num1+i);
		p2 = BIGLITTLE(num2-len,num2+len);
		for (j = i; j < len; j++) {
			BIG(y = (BNWORD128)*--p1 * *p2++;)
			LITTLE(y = (BNWORD128)*p1++ * *--p2;)
			x += y;
			carry += (x < y);
		}
		BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
		x = (x >> 64) | (BNWORD128)carry << 64;
	}

	/* Word 2*len-1: whatever remains in the accumulator. */
	BIGLITTLE(*--prod,*prod) = (BNWORD64)x;
}
#else /* !defined(BNWORD128) || !PRODUCT_SCAN */
/* Default trivial macro definition */
#define lbnMulX_64(prod, num1, num2, len) lbnMul_64(prod, num1, len, num2, len)
#endif /* !defined(BNWORD128) || !PRODUCT_SCAN */
#endif /* !lbnMulX_64 */
829
#if !defined(lbnMontMul_64) && defined(BNWORD128) && PRODUCT_SCAN
/*
 * Test code for product-scanning multiply. This seems to slow the C
 * code down rather than speed it up.
 * This does a multiply and Montgomery reduction together, using the
 * same loops. The outer loop scans across the product, twice.
 * The first pass computes the low half of the product and the
 * Montgomery multipliers. These are stored in the product array,
 * which contains no data as of yet. x and carry add up the columns
 * and propagate carries forward.
 *
 * The second half multiplies the upper half, adding in the modulus
 * times the Montgomery multipliers. The results of this multiply
 * are stored.
 */
static void
lbnMontMul_64(BNWORD64 *prod, BNWORD64 const *num1, BNWORD64 const *num2,
	BNWORD64 const *mod, unsigned len, BNWORD64 inv)
{
	BNWORD128 x, y;
	BNWORD64 const *p1, *p2, *pm;
	BNWORD64 *pp;
	BNWORD64 t;
	unsigned carry;
	unsigned i, j;

	/* Special case of zero */
	if (!len)
		return;

	/*
	 * This computes directly into the high half of prod, so just
	 * shift the pointer and consider prod only "len" elements long
	 * for the rest of the code.
	 */
	BIGLITTLE(prod -= len, prod += len);

	/* Pass 1 - compute Montgomery multipliers */
	/* First iteration can have certain simplifications. */
	x = (BNWORD128)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
	/* Multiplier chosen so this column reduces to zero mod 2^64. */
	BIGLITTLE(prod[-1], prod[0]) = t = inv * (BNWORD64)x;
	y = (BNWORD128)t * BIGLITTLE(mod[-1],mod[0]);
	x += y;
	/* Note: GCC 2.6.3 has a bug if you try to eliminate "carry" */
	carry = (x < y);
	assert((BNWORD64)x == 0);
	x = x >> 64 | (BNWORD128)carry << 64;

	for (i = 1; i < len; i++) {
		carry = 0;
		p1 = num1;
		p2 = BIGLITTLE(num2-i-1,num2+i+1);
		pp = prod;
		pm = BIGLITTLE(mod-i-1,mod+i+1);
		for (j = 0; j < i; j++) {
			/* Partial product of the inputs... */
			y = (BNWORD128)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
			x += y;
			carry += (x < y);
			/* ...plus multiplier-so-far times modulus. */
			y = (BNWORD128)BIGLITTLE(*--pp * *pm++, *pp++ * *--pm);
			x += y;
			carry += (x < y);
		}
		y = (BNWORD128)BIGLITTLE(p1[-1] * p2[0], p1[0] * p2[-1]);
		x += y;
		carry += (x < y);
		assert(BIGLITTLE(pp == prod-i, pp == prod+i));
		BIGLITTLE(pp[-1], pp[0]) = t = inv * (BNWORD64)x;
		assert(BIGLITTLE(pm == mod-1, pm == mod+1));
		y = (BNWORD128)t * BIGLITTLE(pm[0],pm[-1]);
		x += y;
		carry += (x < y);
		assert((BNWORD64)x == 0);	/* Column fully reduced */
		x = x >> 64 | (BNWORD128)carry << 64;
	}

	/* Pass 2 - compute reduced product and store */
	for (i = 1; i < len; i++) {
		carry = 0;
		p1 = BIGLITTLE(num1-i,num1+i);
		p2 = BIGLITTLE(num2-len,num2+len);
		pm = BIGLITTLE(mod-i,mod+i);
		pp = BIGLITTLE(prod-len,prod+len);
		for (j = i; j < len; j++) {
			y = (BNWORD128)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
			x += y;
			carry += (x < y);
			y = (BNWORD128)BIGLITTLE(*--pm * *pp++, *pm++ * *--pp);
			x += y;
			carry += (x < y);
		}
		assert(BIGLITTLE(pm == mod-len, pm == mod+len));
		assert(BIGLITTLE(pp == prod-i, pp == prod+i));
		BIGLITTLE(pp[0],pp[-1]) = (BNWORD64)x;
		x = (x >> 64) | (BNWORD128)carry << 64;
	}

	/* Last round of second half, simplified. */
	BIGLITTLE(*(prod-len),*(prod+len-1)) = (BNWORD64)x;
	carry = (x >> 64);

	/* Final reduction: the result may still exceed the modulus. */
	while (carry)
		carry -= lbnSubN_64(prod, mod, len);
	while (lbnCmp_64(prod, mod, len) >= 0)
		(void)lbnSubN_64(prod, mod, len);
}
/* Suppress later definition */
#define lbnMontMul_64 lbnMontMul_64
#endif
938
#if !defined(lbnSquare_64) && defined(BNWORD128) && PRODUCT_SCAN
/*
 * Trial code for product-scanning squaring. This seems to slow the C
 * code down rather than speed it up.
 */
void
lbnSquare_64(BNWORD64 *prod, BNWORD64 const *num, unsigned len)
{
	BNWORD128 x, y, z;
	BNWORD64 const *p1, *p2;
	unsigned carry;
	unsigned i, j;

	/* Special case of zero */
	if (!len)
		return;

	/* Word 0 of product */
	x = (BNWORD128)BIGLITTLE(num[-1] * num[-1], num[0] * num[0]);
	BIGLITTLE(*--prod, *prod++) = (BNWORD64)x;
	x >>= 64;

	/* Words 1 through len-1 */
	for (i = 1; i < len; i++) {
		carry = 0;
		y = 0;
		p1 = num;
		p2 = BIGLITTLE(num-i-1,num+i+1);
		/* Sum the off-diagonal partial products of this column. */
		for (j = 0; j < (i+1)/2; j++) {
			BIG(z = (BNWORD128)*--p1 * *p2++;)
			LITTLE(z = (BNWORD128)*p1++ * *--p2;)
			y += z;
			carry += (y < z);
		}
		/* Double the off-diagonal sum (and its carry bits). */
		y += z = y;
		carry += carry + (y < z);
		/* Odd columns also contain a diagonal (squared) term. */
		if ((i & 1) == 0) {
			assert(BIGLITTLE(--p1 == p2, p1 == --p2));
			BIG(z = (BNWORD128)*p2 * *p2;)
			LITTLE(z = (BNWORD128)*p1 * *p1;)
			y += z;
			carry += (y < z);
		}
		x += y;
		carry += (x < y);
		BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
		x = (x >> 64) | (BNWORD128)carry << 64;
	}
	/* Words len through 2*len-2 */
	for (i = 1; i < len; i++) {
		carry = 0;
		y = 0;
		p1 = BIGLITTLE(num-i,num+i);
		p2 = BIGLITTLE(num-len,num+len);
		for (j = 0; j < (len-i)/2; j++) {
			BIG(z = (BNWORD128)*--p1 * *p2++;)
			LITTLE(z = (BNWORD128)*p1++ * *--p2;)
			y += z;
			carry += (y < z);
		}
		/* Double the off-diagonal sum (and its carry bits). */
		y += z = y;
		carry += carry + (y < z);
		if ((len-i) & 1) {
			assert(BIGLITTLE(--p1 == p2, p1 == --p2));
			BIG(z = (BNWORD128)*p2 * *p2;)
			LITTLE(z = (BNWORD128)*p1 * *p1;)
			y += z;
			carry += (y < z);
		}
		x += y;
		carry += (x < y);
		BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
		x = (x >> 64) | (BNWORD128)carry << 64;
	}

	/* Word 2*len-1 */
	BIGLITTLE(*--prod,*prod) = (BNWORD64)x;
}
/* Suppress later definition */
#define lbnSquare_64 lbnSquare_64
#endif
1020
1021 /*
1022 * Square a number, using optimized squaring to reduce the number of
1023  * primitive multiplies that are executed.  There may not be any
1024 * overlap of the input and output.
1025 *
1026 * Technique: Consider the partial products in the multiplication
1027 * of "abcde" by itself:
1028 *
1029 * a b c d e
1030 * * a b c d e
1031 * ==================
1032 * ae be ce de ee
1033 * ad bd cd dd de
1034 * ac bc cc cd ce
1035 * ab bb bc bd be
1036 * aa ab ac ad ae
1037 *
1038 * Note that everything above the main diagonal:
1039 * ae be ce de = (abcd) * e
1040 * ad bd cd = (abc) * d
1041 * ac bc = (ab) * c
1042 * ab = (a) * b
1043 *
1044 * is a copy of everything below the main diagonal:
1045 * de
1046 * cd ce
1047 * bc bd be
1048 * ab ac ad ae
1049 *
1050 * Thus, the sum is 2 * (off the diagonal) + diagonal.
1051 *
1052 * This is accumulated beginning with the diagonal (which
1053  * consists of the squares of the digits of the input), which is then
1054 * divided by two, the off-diagonal added, and multiplied by two
1055 * again. The low bit is simply a copy of the low bit of the
1056 * input, so it doesn't need special care.
1057 *
1058 * TODO: Merge the shift by 1 with the squaring loop.
1059 * TODO: Use Karatsuba. (a*W+b)^2 = a^2 * (W^2+W) + b^2 * (W+1) - (a-b)^2 * W.
1060 */
#ifndef lbnSquare_64
/*
 * prod (2*len words) = num^2, using the halve/add-off-diagonal/double
 * technique described in the comment above: store all word squares,
 * shift right one bit, add in the off-diagonal products, shift back
 * up, and patch the low bit.  prod and num may not overlap.
 */
void
lbnSquare_64(BNWORD64 *prod, BNWORD64 const *num, unsigned len)
{
	BNWORD64 t;
	BNWORD64 *prodx = prod;		/* Working copy of the argument */
	BNWORD64 const *numx = num;	/* Working copy of the argument */
	unsigned lenx = len;		/* Working copy of the argument */

	if (!len)
		return;

	/* First, store all the squares */
	while (lenx--) {
#ifdef mul64_ppmm
		BNWORD64 ph, pl;
		t = BIGLITTLE(*--numx,*numx++);
		mul64_ppmm(ph,pl,t,t);
		BIGLITTLE(*--prodx,*prodx++) = pl;
		BIGLITTLE(*--prodx,*prodx++) = ph;
#elif defined(BNWORD128) /* use BNWORD128 */
		BNWORD128 p;
		t = BIGLITTLE(*--numx,*numx++);
		p = (BNWORD128)t * t;
		BIGLITTLE(*--prodx,*prodx++) = (BNWORD64)p;
		BIGLITTLE(*--prodx,*prodx++) = (BNWORD64)(p>>64);
#else	/* Use lbnMulN1_64 */
		t = BIGLITTLE(numx[-1],*numx);
		lbnMulN1_64(prodx, numx, 1, t);
		BIGLITTLE(--numx,numx++);
		BIGLITTLE(prodx -= 2, prodx += 2);
#endif
	}
	/* Then, shift right 1 bit (divide the diagonal by two) */
	(void)lbnRshift_64(prod, 2*len, 1);

	/* Then, add in the off-diagonal sums: num[i] * num[i+1..len-1] */
	lenx = len;
	numx = num;
	prodx = prod;
	while (--lenx) {
		t = BIGLITTLE(*--numx,*numx++);
		BIGLITTLE(--prodx,prodx++);
		t = lbnMulAdd1_64(prodx, numx, lenx, t);
		/* Propagate the multiply-add's carry word up the product */
		lbnAdd1_64(BIGLITTLE(prodx-lenx,prodx+lenx), lenx+1, t);
		BIGLITTLE(--prodx,prodx++);
	}

	/* Shift it back up */
	lbnDouble_64(prod, 2*len);

	/* And set the low bit appropriately (it is simply bit 0 of num) */
	BIGLITTLE(prod[-1],prod[0]) |= BIGLITTLE(num[-1],num[0]) & 1;
}
#endif /* !lbnSquare_64 */
1116
1117 /*
1118 * lbnNorm_64 - given a number, return a modified length such that the
1119 * most significant digit is non-zero. Zero-length input is okay.
1120 */
1121 #ifndef lbnNorm_64
1122 unsigned
1123 lbnNorm_64(BNWORD64 const *num, unsigned len)
1124 {
1125 BIGLITTLE(num -= len,num += len);
1126 while (len && BIGLITTLE(*num++,*--num) == 0)
1127 --len;
1128 return len;
1129 }
1130 #endif /* lbnNorm_64 */
1131
1132 /*
1133 * lbnBits_64 - return the number of significant bits in the array.
1134 * It starts by normalizing the array. Zero-length input is okay.
1135 * Then assuming there's anything to it, it fetches the high word,
1136 * generates a bit length by multiplying the word length by 64, and
1137 * subtracts off 64/2, 64/4, 64/8, ... bits if the high bits are clear.
1138 */
1139 #ifndef lbnBits_64
1140 unsigned
1141 lbnBits_64(BNWORD64 const *num, unsigned len)
1142 {
1143 BNWORD64 t;
1144 unsigned i;
1145
1146 len = lbnNorm_64(num, len);
1147 if (len) {
1148 t = BIGLITTLE(*(num-len),*(num+(len-1)));
1149 assert(t);
1150 len *= 64;
1151 i = 64/2;
1152 do {
1153 if (t >> i)
1154 t >>= i;
1155 else
1156 len -= i;
1157 } while ((i /= 2) != 0);
1158 }
1159 return len;
1160 }
1161 #endif /* lbnBits_64 */
1162
1163 /*
1164 * If defined, use hand-rolled divide rather than compiler's native.
1165 * If the machine doesn't do it in line, the manual code is probably
1166 * faster, since it can assume normalization and the fact that the
1167 * quotient will fit into 64 bits, which a general 128-bit divide
1168 * in a compiler's run-time library can't do.
1169 */
1170 #ifndef BN_SLOW_DIVIDE_128
1171 /* Assume that divisors of more than thirty-two bits are slow */
1172 #define BN_SLOW_DIVIDE_128 (128 > 0x20)
1173 #endif
1174
1175 /*
1176 * Return (nh<<64|nl) % d, and place the quotient digit into *q.
1177 * It is guaranteed that nh < d, and that d is normalized (with its high
1178 * bit set). If we have a double-width type, it's easy. If not, ooh,
1179 * yuk!
1180 */
1181 #ifndef lbnDiv21_64
1182 #if defined(BNWORD128) && !BN_SLOW_DIVIDE_128
1183 BNWORD64
1184 lbnDiv21_64(BNWORD64 *q, BNWORD64 nh, BNWORD64 nl, BNWORD64 d)
1185 {
1186 BNWORD128 n = (BNWORD128)nh << 64 | nl;
1187
1188 /* Divisor must be normalized */
1189 assert(d >> (64-1) == 1);
1190
1191 *q = n / d;
1192 return n % d;
1193 }
1194 #else
1195 /*
1196 * This is where it gets ugly.
1197 *
1198 * Do the division in two halves, using Algorithm D from section 4.3.1
1199 * of Knuth. Note Theorem B from that section, that the quotient estimate
1200 * is never more than the true quotient, and is never more than two
1201 * too low.
1202 *
1203 * The mapping onto conventional long division is (everything a half word):
1204 * _____________qh___ql_
1205 * dh dl ) nh.h nh.l nl.h nl.l
1206 * - (qh * d)
1207 * -----------
1208 * rrrr rrrr nl.l
1209 * - (ql * d)
1210 * -----------
1211 * rrrr rrrr
1212 *
1213 * The implicit 3/2-digit d*qh and d*ql subtractors are computed this way:
1214 * First, estimate a q digit so that nh/dh works. Subtracting qh*dh from
1215 * the (nh.h nh.l) list leaves a 1/2-word remainder r. Then compute the
1216 * low part of the subtractor, qh * dl. This also needs to be subtracted
1217 * from (nh.h nh.l nl.h) to get the final remainder. So we take the
1218 * remainder, which is (nh.h nh.l) - qh*dl, shift it and add in nl.h, and
1219 * try to subtract qh * dl from that. Since the remainder is 1/2-word
1220 * long, shifting and adding nl.h results in a single word r.
1221 * It is possible that the remainder we're working with, r, is less than
1222 * the product qh * dl, if we estimated qh too high. The estimation
1223 * technique can produce a qh that is too large (never too small), leading
1224 * to r which is too small. In that case, decrement the digit qh, add
1225 * shifted dh to r (to correct for that error), and subtract dl from the
1226 * product we're comparing r with. That's the "correct" way to do it, but
1227 * just adding dl to r instead of subtracting it from the product is
1228 * equivalent and a lot simpler. You just have to watch out for overflow.
1229 *
1230 * The process is repeated with (rrrr rrrr nl.l) for the low digit of the
1231 * quotient ql.
1232 *
1233 * The various uses of 64/2 for shifts are because of the note about
1234 * automatic editing of this file at the very top of the file.
1235 */
#define highhalf(x) ( (x) >> 64/2 )
#define lowhalf(x) ( (x) & (((BNWORD64)1 << 64/2)-1) )
/*
 * Divide the two-word number (nh,nl) by the normalized one-word
 * divisor d, half a word at a time, per the long comment above
 * (Knuth 4.3.1 Algorithm D).  Stores the one-word quotient in *q
 * and returns the remainder.
 */
BNWORD64
lbnDiv21_64(BNWORD64 *q, BNWORD64 nh, BNWORD64 nl, BNWORD64 d)
{
	BNWORD64 dh = highhalf(d), dl = lowhalf(d);
	BNWORD64 qh, ql, prod, r;

	/* Divisor must be normalized */
	assert((d >> (64-1)) == 1);

	/* Do first half-word of division */
	qh = nh / dh;
	r = nh % dh;
	prod = qh * dl;

	/*
	 * Add next half-word of numerator to remainder and correct.
	 * qh may be up to two too large.
	 */
	r = (r << (64/2)) | highhalf(nl);
	if (r < prod) {
		--qh; r += d;
		/* r >= d detects wraparound of r += d; if it wrapped, qh is now right */
		if (r >= d && r < prod) {
			--qh; r += d;
		}
	}
	r -= prod;

	/* Do second half-word of division */
	ql = r / dh;
	r = r % dh;
	prod = ql * dl;

	r = (r << (64/2)) | lowhalf(nl);
	if (r < prod) {
		--ql; r += d;
		if (r >= d && r < prod) {
			--ql; r += d;
		}
	}
	r -= prod;

	*q = (qh << (64/2)) | ql;

	return r;
}
1283 #endif
1284 #endif /* lbnDiv21_64 */
1285
1286
1287 /*
1288 * In the division functions, the dividend and divisor are referred to
1289 * as "n" and "d", which stand for "numerator" and "denominator".
1290 *
1291 * The quotient is (nlen-dlen+1) digits long. It may be overlapped with
1292 * the high (nlen-dlen) words of the dividend, but one extra word is needed
1293 * on top to hold the top word.
1294 */
1295
1296 /*
1297 * Divide an n-word number by a 1-word number, storing the remainder
1298 * and n-1 words of the n-word quotient. The high word is returned.
1299 * It IS legal for rem to point to the same address as n, and for
1300 * q to point one word higher.
1301 *
1302 * TODO: If BN_SLOW_DIVIDE_128, add a divnhalf_64 which uses 64-bit
1303 * dividends if the divisor is half that long.
1304 * TODO: Shift the dividend on the fly to avoid the last division and
1305 * instead have a remainder that needs shifting.
1306 * TODO: Use reciprocals rather than dividing.
1307 */
#ifndef lbnDiv1_64
/*
 * Divide the len-word number n by the one-word divisor d, storing the
 * remainder in *rem and the low len-1 quotient words through q; the
 * high quotient word is returned.  d is normalized internally so that
 * lbnDiv21_64's precondition (high bit of divisor set) holds.
 */
BNWORD64
lbnDiv1_64(BNWORD64 *q, BNWORD64 *rem, BNWORD64 const *n, unsigned len,
	BNWORD64 d)
{
	unsigned shift;		/* Normalization shift applied to d */
	unsigned xlen;		/* Scratch: bit-search width, then word count */
	BNWORD64 r;
	BNWORD64 qhigh;		/* High quotient word (returned) */

	assert(len > 0);
	assert(d);

	/* One-word dividend: a single native divide does it */
	if (len == 1) {
		r = *n;
		*rem = r%d;
		return r/d;
	}

	/* Count leading zero bits of d by binary search (32,16,8,4,2,1) */
	shift = 0;
	r = d;
	xlen = 64/2;
	do {
		if (r >> xlen)
			r >>= xlen;
		else
			shift += xlen;
	} while ((xlen /= 2) != 0);
	assert((d >> (64-1-shift)) == 1);
	d <<= shift;

	/* Point q and n at their most-significant ends */
	BIGLITTLE(q -= len-1,q += len-1);
	BIGLITTLE(n -= len,n += len);

	/* First quotient digit: no incoming remainder, so handle specially */
	r = BIGLITTLE(*n++,*--n);
	if (r < d) {
		qhigh = 0;
	} else {
		qhigh = r/d;
		r %= d;
	}

	/* Remaining digits via two-word by one-word divides */
	xlen = len;
	while (--xlen)
		r = lbnDiv21_64(BIGLITTLE(q++,--q), r, BIGLITTLE(*n++,*--n), d);

	/*
	 * Final correction for shift - shift the quotient up "shift"
	 * bits, and merge in the extra bits of quotient. Then reduce
	 * the final remainder mod the real d.
	 */
	if (shift) {
		d >>= shift;
		qhigh = (qhigh << shift) | lbnLshift_64(q, len-1, shift);
		BIGLITTLE(q[-1],*q) |= r/d;
		r %= d;
	}
	*rem = r;

	return qhigh;
}
#endif
1370
1371 /*
1372 * This function performs a "quick" modulus of a number with a divisor
1373 * d which is guaranteed to be at most sixteen bits, i.e. less than 65536.
1374 * This applies regardless of the word size the library is compiled with.
1375 *
1376 * This function is important to prime generation, for sieving.
1377 */
1378 #ifndef lbnModQ_64
1379 /* If there's a custom lbnMod21_64, no normalization needed */
1380 #ifdef lbnMod21_64
1381 unsigned
1382 lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
1383 {
1384 unsigned i, shift;
1385 BNWORD64 r;
1386
1387 assert(len > 0);
1388
1389 BIGLITTLE(n -= len,n += len);
1390
1391 /* Try using a compare to avoid the first divide */
1392 r = BIGLITTLE(*n++,*--n);
1393 if (r >= d)
1394 r %= d;
1395 while (--len)
1396 r = lbnMod21_64(r, BIGLITTLE(*n++,*--n), d);
1397
1398 return r;
1399 }
1400 #elif defined(BNWORD128) && !BN_SLOW_DIVIDE_128
1401 unsigned
1402 lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
1403 {
1404 BNWORD64 r;
1405
1406 if (!--len)
1407 return BIGLITTLE(n[-1],n[0]) % d;
1408
1409 BIGLITTLE(n -= len,n += len);
1410 r = BIGLITTLE(n[-1],n[0]);
1411
1412 do {
1413 r = (BNWORD64)((((BNWORD128)r<<64) | BIGLITTLE(*n++,*--n)) % d);
1414 } while (--len);
1415
1416 return r;
1417 }
1418 #elif 64 >= 0x20
1419 /*
1420 * If the single word size can hold 65535*65536, then this function
1421 * is avilable.
1422 */
1423 #ifndef highhalf
1424 #define highhalf(x) ( (x) >> 64/2 )
1425 #define lowhalf(x) ( (x) & ((1 << 64/2)-1) )
1426 #endif
1427 unsigned
1428 lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
1429 {
1430 BNWORD64 r, x;
1431
1432 BIGLITTLE(n -= len,n += len);
1433
1434 r = BIGLITTLE(*n++,*--n);
1435 while (--len) {
1436 x = BIGLITTLE(*n++,*--n);
1437 r = (r%d << 64/2) | highhalf(x);
1438 r = (r%d << 64/2) | lowhalf(x);
1439 }
1440
1441 return r%d;
1442 }
1443 #else
1444 /* Default case - use lbnDiv21_64 */
1445 unsigned
1446 lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
1447 {
1448 unsigned i, shift;
1449 BNWORD64 r;
1450 BNWORD64 q;
1451
1452 assert(len > 0);
1453
1454 shift = 0;
1455 r = d;
1456 i = 64;
1457 while (i /= 2) {
1458 if (r >> i)
1459 r >>= i;
1460 else
1461 shift += i;
1462 }
1463 assert(d >> (64-1-shift) == 1);
1464 d <<= shift;
1465
1466 BIGLITTLE(n -= len,n += len);
1467
1468 r = BIGLITTLE(*n++,*--n);
1469 if (r >= d)
1470 r %= d;
1471
1472 while (--len)
1473 r = lbnDiv21_64(&q, r, BIGLITTLE(*n++,*--n), d);
1474
1475 /*
1476 * Final correction for shift - shift the quotient up "shift"
1477 * bits, and merge in the extra bits of quotient. Then reduce
1478 * the final remainder mod the real d.
1479 */
1480 if (shift)
1481 r %= d >> shift;
1482
1483 return r;
1484 }
1485 #endif
1486 #endif /* lbnModQ_64 */
1487
1488 /*
1489 * Reduce n mod d and return the quotient. That is, find:
1490 * q = n / d;
1491 * n = n % d;
1492 * d is altered during the execution of this subroutine by normalizing it.
1493 * It must already have its most significant word non-zero; it is shifted
1494 * so its most significant bit is non-zero.
1495 *
1496 * The quotient q is nlen-dlen+1 words long. To make it possible to
1497  * overlap the quotient with the input (you can store it in the high dlen
1498 * words), the high word of the quotient is *not* stored, but is returned.
1499 * (If all you want is the remainder, you don't care about it, anyway.)
1500 *
1501 * This uses algorithm D from Knuth (4.3.1), except that we do binary
1502 * (shift) normalization of the divisor. WARNING: This is hairy!
1503 *
1504 * This function is used for some modular reduction, but it is not used in
1505 * the modular exponentiation loops; they use Montgomery form and the
1506 * corresponding, more efficient, Montgomery reduction. This code
1507 * is needed for the conversion to Montgomery form, however, so it
1508 * has to be here and it might as well be reasonably efficient.
1509 *
1510 * The overall operation is as follows ("top" and "up" refer to the
1511 * most significant end of the number; "bottom" and "down", the least):
1512 *
1513 * - Shift the divisor up until the most significant bit is set.
1514 * - Shift the dividend up the same amount. This will produce the
1515 * correct quotient, and the remainder can be recovered by shifting
1516 * it back down the same number of bits. This may produce an overflow
1517 * word, but the word is always strictly less than the most significant
1518 * divisor word.
1519 * - Estimate the first quotient digit qhat:
1520 * - First take the top two words (one of which is the overflow) of the
1521 * dividend and divide by the top word of the divisor:
1522 * qhat = (nh,nm)/dh. This qhat is >= the correct quotient digit
1523 * and, since dh is normalized, it is at most two over.
1524 * - Second, correct by comparing the top three words. If
1525 * (dh,dl) * qhat > (nh,nm,ml), decrease qhat and try again.
1526 * The second iteration can be simpler because there can't be a third.
1527 * The computation can be simplified by subtracting dh*qhat from
1528 * both sides, suitably shifted. This reduces the left side to
1529 * dl*qhat. On the right, (nh,nm)-dh*qhat is simply the
1530 * remainder r from (nh,nm)%dh, so the right is (r,nl).
1531 * This produces qhat that is almost always correct and at
1532 * most (prob ~ 2/2^64) one too high.
1533 * - Subtract qhat times the divisor (suitably shifted) from the dividend.
1534 * If there is a borrow, qhat was wrong, so decrement it
1535 * and add the divisor back in (once).
1536 * - Store the final quotient digit qhat in the quotient array q.
1537 *
1538 * Repeat the quotient digit computation for successive digits of the
1539 * quotient until the whole quotient has been computed. Then shift the
1540 * divisor and the remainder down to correct for the normalization.
1541 *
1542 * TODO: Special case 2-word divisors.
1543 * TODO: Use reciprocals rather than dividing.
1544 */
#ifndef divn_64
/*
 * General long division, Knuth 4.3.1 Algorithm D with binary (shift)
 * normalization of the divisor; see the long comment above.  Computes
 * q = n / d and leaves n = n % d.  The high quotient word is returned
 * rather than stored.  Both n and d are modified during execution
 * (normalized and then de-normalized).
 */
BNWORD64
lbnDiv_64(BNWORD64 *q, BNWORD64 *n, unsigned nlen, BNWORD64 *d, unsigned dlen)
{
	BNWORD64 nh,nm,nl;	/* Top three words of the dividend */
	BNWORD64 dh,dl;		/* Top two words of the divisor */
	BNWORD64 qhat;		/* Estimate of quotient word */
	BNWORD64 r;		/* Remainder from quotient estimate division */
	BNWORD64 qhigh;		/* High word of quotient */
	unsigned i;		/* Temp */
	unsigned shift;		/* Bits shifted by normalization */
	unsigned qlen = nlen-dlen;	/* Size of quotient (less 1) */
#ifdef mul64_ppmm
	BNWORD64 t64;
#elif defined(BNWORD128)
	BNWORD128 t128;
#else /* use lbnMulN1_64 */
	BNWORD64 t2[2];
#define t2high BIGLITTLE(t2[0],t2[1])
#define t2low BIGLITTLE(t2[1],t2[0])
#endif

	assert(dlen);
	assert(nlen >= dlen);

	/*
	 * Special cases for short divisors. The general case uses the
	 * top 2 digits of the divisor (d) to estimate a quotient digit,
	 * so it breaks if there are fewer digits available. Thus, we need
	 * special cases for a divisor of length 1. A divisor of length
	 * 2 can have a *lot* of administrivia overhead removed,
	 * so it's probably worth special-casing that case, too.
	 */
	if (dlen == 1)
		return lbnDiv1_64(q, BIGLITTLE(n-1,n), n, nlen,
			BIGLITTLE(d[-1],d[0]));

#if 0
	/*
	 * @@@ This is not yet written... The general loop will do,
	 * albeit less efficiently
	 */
	if (dlen == 2) {
		/*
		 * divisor two digits long:
		 * use the 3/2 technique from Knuth, but we know
		 * it's exact.
		 */
		dh = BIGLITTLE(d[-1],d[0]);
		dl = BIGLITTLE(d[-2],d[1]);
		shift = 0;
		if ((sh & ((BNWORD64)1 << 64-1-shift)) == 0) {
			do {
				shift++;
			} while (dh & (BNWORD64)1<<64-1-shift) == 0);
			dh = dh << shift | dl >> (64-shift);
			dl <<= shift;


		}


		for (shift = 0; (dh & (BNWORD64)1 << 64-1-shift)) == 0; shift++)
			;
		if (shift) {
		}
		dh = dh << shift | dl >> (64-shift);
		shift = 0;
		while (dh
	}
#endif

	dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
	assert(dh);

	/* Normalize the divisor: count leading zero bits by binary search */
	shift = 0;
	r = dh;
	i = 64/2;
	do {
		if (r >> i)
			r >>= i;
		else
			shift += i;
	} while ((i /= 2) != 0);

	nh = 0;
	if (shift) {
		lbnLshift_64(d, dlen, shift);
		dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
		/* The bits shifted out of n become the overflow word nh */
		nh = lbnLshift_64(n, nlen, shift);
	}

	/* Assert that dh is now normalized */
	assert(dh >> (64-1));

	/* Also get the second-most significant word of the divisor */
	dl = BIGLITTLE(*(d-(dlen-1)),*(d+(dlen-2)));

	/*
	 * Adjust pointers: n to point to least significant end of the
	 * first subtract, and q to the most-significant end of the
	 * quotient array.
	 */
	BIGLITTLE(n -= qlen,n += qlen);
	BIGLITTLE(q -= qlen,q += qlen);

	/* Fetch the most significant stored word of the dividend */
	nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));

	/*
	 * Compute the first digit of the quotient, based on the
	 * first two words of the dividend (the most significant of which
	 * is the overflow word h).
	 */
	if (nh) {
		assert(nh < dh);
		r = lbnDiv21_64(&qhat, nh, nm, dh);
	} else if (nm >= dh) {
		qhat = nm/dh;
		r = nm % dh;
	} else { /* Quotient is zero */
		qhigh = 0;
		goto divloop;
	}

	/* Now get the third most significant word of the dividend */
	nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));

	/*
	 * Correct qhat, the estimate of quotient digit.
	 * qhat can only be high, and at most two words high,
	 * so the loop can be unrolled and abbreviated.
	 */
#ifdef mul64_ppmm
	mul64_ppmm(nm, t64, qhat, dl);
	if (nm > r || (nm == r && t64 > nl)) {
		/* Decrement qhat and adjust comparison parameters */
		qhat--;
		if ((r += dh) >= dh) {
			nm -= (t64 < dl);
			t64 -= dl;
			if (nm > r || (nm == r && t64 > nl))
				qhat--;
		}
	}
#elif defined(BNWORD128)
	t128 = (BNWORD128)qhat * dl;
	if (t128 > ((BNWORD128)r << 64) + nl) {
		/* Decrement qhat and adjust comparison parameters */
		qhat--;
		/*
		 * NOTE(review): every other copy of this correction (the
		 * mul64_ppmm and lbnMulN1_64 branches here, and all three
		 * branches in the main loop below) tests ">= dh" for the
		 * no-overflow case; this "> dh" looks suspect (it skips the
		 * second correction when r was exactly 0) - verify against
		 * upstream bnlib.
		 */
		if ((r += dh) > dh) {
			t128 -= dl;
			if (t128 > ((BNWORD128)r << 64) + nl)
				qhat--;
		}
	}
#else /* Use lbnMulN1_64 */
	lbnMulN1_64(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
	if (t2high > r || (t2high == r && t2low > nl)) {
		/* Decrement qhat and adjust comparison parameters */
		qhat--;
		if ((r += dh) >= dh) {
			t2high -= (t2low < dl);
			t2low -= dl;
			if (t2high > r || (t2high == r && t2low > nl))
				qhat--;
		}
	}
#endif

	/* Do the multiply and subtract */
	r = lbnMulSub1_64(n, d, dlen, qhat);
	/* If there was a borrow, add back once. */
	if (r > nh) { /* Borrow? */
		(void)lbnAddN_64(n, d, dlen);
		qhat--;
	}

	/* Remember the first quotient digit. */
	qhigh = qhat;

	/* Now, the main division loop: */
divloop:
	while (qlen--) {

		/* Advance n */
		nh = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
		BIGLITTLE(++n,--n);
		nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));

		if (nh == dh) {
			qhat = ~(BNWORD64)0;
			/* Optimized computation of r = (nh,nm) - qhat * dh */
			r = nh + nm;
			if (r < nh)
				goto subtract;
		} else {
			assert(nh < dh);
			r = lbnDiv21_64(&qhat, nh, nm, dh);
		}

		nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
#ifdef mul64_ppmm
		mul64_ppmm(nm, t64, qhat, dl);
		if (nm > r || (nm == r && t64 > nl)) {
			/* Decrement qhat and adjust comparison parameters */
			qhat--;
			if ((r += dh) >= dh) {
				nm -= (t64 < dl);
				t64 -= dl;
				if (nm > r || (nm == r && t64 > nl))
					qhat--;
			}
		}
#elif defined(BNWORD128)
		t128 = (BNWORD128)qhat * dl;
		if (t128 > ((BNWORD128)r<<64) + nl) {
			/* Decrement qhat and adjust comparison parameters */
			qhat--;
			if ((r += dh) >= dh) {
				t128 -= dl;
				if (t128 > ((BNWORD128)r << 64) + nl)
					qhat--;
			}
		}
#else /* Use lbnMulN1_64 */
		lbnMulN1_64(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
		if (t2high > r || (t2high == r && t2low > nl)) {
			/* Decrement qhat and adjust comparison parameters */
			qhat--;
			if ((r += dh) >= dh) {
				t2high -= (t2low < dl);
				t2low -= dl;
				if (t2high > r || (t2high == r && t2low > nl))
					qhat--;
			}
		}
#endif

		/*
		 * As a point of interest, note that it is not worth checking
		 * for qhat of 0 or 1 and installing special-case code. These
		 * occur with probability 2^-64, so spending 1 cycle to check
		 * for them is only worth it if we save more than 2^15 cycles,
		 * and a multiply-and-subtract for numbers in the 1024-bit
		 * range just doesn't take that long.
		 */
subtract:
		/*
		 * n points to the least significant end of the substring
		 * of n to be subtracted from. qhat is either exact or
		 * one too large. If the subtract gets a borrow, it was
		 * one too large and the divisor is added back in. It's
		 * a dlen+1 word add which is guaranteed to produce a
		 * carry out, so it can be done very simply.
		 */
		r = lbnMulSub1_64(n, d, dlen, qhat);
		if (r > nh) { /* Borrow? */
			(void)lbnAddN_64(n, d, dlen);
			qhat--;
		}
		/* Store the quotient digit */
		BIGLITTLE(*q++,*--q) = qhat;
	}
	/* Tah dah! */

	/* Undo the normalization of d and the remainder left in n */
	if (shift) {
		lbnRshift_64(d, dlen, shift);
		lbnRshift_64(n, dlen, shift);
	}

	return qhigh;
}
#endif
1820
1821 /*
1822 * Find the negative multiplicative inverse of x (x must be odd!) modulo 2^64.
1823 *
1824 * This just performs Newton's iteration until it gets the
1825 * inverse. The initial estimate is always correct to 3 bits, and
1826 * sometimes 4. The number of valid bits doubles each iteration.
1827 * (To prove it, assume x * y == 1 (mod 2^n), and introduce a variable
1828 * for the error mod 2^2n. x * y == 1 + k*2^n (mod 2^2n) and follow
1829 * the iteration through.)
1830 */
1831 #ifndef lbnMontInv1_64
1832 BNWORD64
1833 lbnMontInv1_64(BNWORD64 const x)
1834 {
1835 BNWORD64 y = x, z;
1836
1837 assert(x & 1);
1838
1839 while ((z = x*y) != 1)
1840 y *= 2 - z;
1841 return -y;
1842 }
1843 #endif /* !lbnMontInv1_64 */
1844
#if defined(BNWORD128) && PRODUCT_SCAN
/*
 * Test code for product-scanning Montgomery reduction.
 * This seems to slow the C code down rather than speed it up.
 *
 * The first loop computes the Montgomery multipliers, storing them over
 * the low half of the number n.
 *
 * The second half multiplies the upper half, adding in the modulus
 * times the Montgomery multipliers. The results of this multiply
 * are stored.
 */
void
lbnMontReduce_64(BNWORD64 *n, BNWORD64 const *mod, unsigned mlen, BNWORD64 inv)
{
	BNWORD128 x, y;		/* Column accumulator and partial product */
	BNWORD64 const *pm;	/* Scan pointer into mod */
	BNWORD64 *pn;		/* Scan pointer into n */
	BNWORD64 t;
	unsigned carry;		/* Overflow count out of the 128-bit sums */
	unsigned i, j;

	/* Special case of zero */
	if (!mlen)
		return;

	/* Pass 1 - compute Montgomery multipliers */
	/* First iteration can have certain simplifications. */
	t = BIGLITTLE(n[-1],n[0]);
	x = t;
	t *= inv;
	BIGLITTLE(n[-1], n[0]) = t;
	x += (BNWORD128)t * BIGLITTLE(mod[-1],mod[0]); /* Can't overflow */
	assert((BNWORD64)x == 0);
	x = x >> 64;

	for (i = 1; i < mlen; i++) {
		carry = 0;
		pn = n;
		pm = BIGLITTLE(mod-i-1,mod+i+1);
		/* Accumulate previous multipliers times mod into column i */
		for (j = 0; j < i; j++) {
			y = (BNWORD128)BIGLITTLE(*--pn * *pm++, *pn++ * *--pm);
			x += y;
			carry += (x < y);
		}
		assert(BIGLITTLE(pn == n-i, pn == n+i));
		y = t = BIGLITTLE(pn[-1], pn[0]);
		x += y;
		carry += (x < y);
		/* Choose the multiplier that zeros the low word of the column */
		BIGLITTLE(pn[-1], pn[0]) = t = inv * (BNWORD64)x;
		assert(BIGLITTLE(pm == mod-1, pm == mod+1));
		y = (BNWORD128)t * BIGLITTLE(pm[0],pm[-1]);
		x += y;
		carry += (x < y);
		assert((BNWORD64)x == 0);
		x = x >> 64 | (BNWORD128)carry << 64;
	}

	BIGLITTLE(n -= mlen, n += mlen);

	/* Pass 2 - compute upper words and add to n */
	for (i = 1; i < mlen; i++) {
		carry = 0;
		pm = BIGLITTLE(mod-i,mod+i);
		pn = n;
		for (j = i; j < mlen; j++) {
			y = (BNWORD128)BIGLITTLE(*--pm * *pn++, *pm++ * *--pn);
			x += y;
			carry += (x < y);
		}
		assert(BIGLITTLE(pm == mod-mlen, pm == mod+mlen));
		assert(BIGLITTLE(pn == n+mlen-i, pn == n-mlen+i));
		y = t = BIGLITTLE(*(n-i),*(n+i-1));
		x += y;
		carry += (x < y);
		BIGLITTLE(*(n-i),*(n+i-1)) = (BNWORD64)x;
		x = (x >> 64) | (BNWORD128)carry << 64;
	}

	/* Last round of second half, simplified. */
	t = BIGLITTLE(*(n-mlen),*(n+mlen-1));
	x += t;
	BIGLITTLE(*(n-mlen),*(n+mlen-1)) = (BNWORD64)x;
	carry = (unsigned)(x >> 64);

	/* The result may exceed mod; subtract until it is in range */
	while (carry)
		carry -= lbnSubN_64(n, mod, mlen);
	while (lbnCmp_64(n, mod, mlen) >= 0)
		(void)lbnSubN_64(n, mod, mlen);
}
#define lbnMontReduce_64 lbnMontReduce_64
#endif
1937
1938 /*
1939 * Montgomery reduce n, modulo mod. This reduces modulo mod and divides by
1940 * 2^(64*mlen). Returns the result in the *top* mlen words of the argument n.
1941 * This is ready for another multiplication using lbnMul_64.
1942 *
1943 * Montgomery representation is a very useful way to encode numbers when
1944 * you're doing lots of modular reduction. What you do is pick a multiplier
1945 * R which is relatively prime to the modulus and very easy to divide by.
1946  * Since the modulus is odd, R is chosen as a power of 2, so the division
1947 * is a shift. In fact, it's a shift of an integral number of words,
1948 * so the shift can be implicit - just drop the low-order words.
1949 *
1950 * Now, choose R *larger* than the modulus m, 2^(64*mlen). Then convert
1951 * all numbers a, b, etc. to Montgomery form M(a), M(b), etc using the
1952 * relationship M(a) = a*R mod m, M(b) = b*R mod m, etc. Note that:
1953 * - The Montgomery form of a number depends on the modulus m.
1954 * A fixed modulus m is assumed throughout this discussion.
1955  * - Since R is relatively prime to m, multiplication by R is invertible;
1956 * no information about the numbers is lost, they're just scrambled.
1957 * - Adding (and subtracting) numbers in this form works just as usual.
1958 * M(a+b) = (a+b)*R mod m = (a*R + b*R) mod m = (M(a) + M(b)) mod m
1959 * - Multiplying numbers in this form produces a*b*R*R. The problem
1960 * is to divide out the excess factor of R, modulo m as well as to
1961 * reduce to the given length mlen. It turns out that this can be
1962 * done *faster* than a normal divide, which is where the speedup
1963 * in Montgomery division comes from.
1964 *
1965 * Normal reduction chooses a most-significant quotient digit q and then
1966 * subtracts q*m from the number to be reduced. Choosing q is tricky
1967 * and involved (just look at lbnDiv_64 to see!) and is usually
1968 * imperfect, requiring a check for correction after the subtraction.
1969 *
1970 * Montgomery reduction *adds* a multiple of m to the *low-order* part
1971 * of the number to be reduced. This multiple is chosen to make the
1972 * low-order part of the number come out to zero. This can be done
1973 * with no trickery or error using a precomputed inverse of the modulus.
1974 * In this code, the "part" is one word, but any width can be used.
1975 *
1976 * Repeating this step sufficiently often results in a value which
1977 * is a multiple of R (a power of two, remember) but is still (since
1978 * the additions were to the low-order part and thus did not increase
1979 * the value of the number being reduced very much) still not much
1980 * larger than m*R. Then implicitly divide by R and subtract off
1981 * m until the result is in the correct range.
1982 *
1983 * Since the low-order part being cancelled is less than R, the
1984 * multiple of m added must have a multiplier which is at most R-1.
1985 * Assuming that the input is at most m*R-1, the final number is
1986 * at most m*(2*R-1)-1 = 2*m*R - m - 1, so subtracting m once from
1987 * the high-order part, equivalent to subtracting m*R from the
 * whole number, produces a result which is at most m*R - m - 1,
1989 * which divided by R is at most m-1.
1990 *
1991 * To convert *to* Montgomery form, you need a regular remainder
1992 * routine, although you can just compute R*R (mod m) and do the
1993 * conversion using Montgomery multiplication. To convert *from*
1994 * Montgomery form, just Montgomery reduce the number to
1995 * remove the extra factor of R.
1996 *
1997 * TODO: Change to a full inverse and use Karatsuba's multiplication
1998 * rather than this word-at-a-time.
1999 */
#ifndef lbnMontReduce_64
/*
 * Montgomery-reduce the 2*mlen-word value at "n" in place.
 *
 * "n" points to the least-significant end of a 2*mlen-word buffer.
 * "inv" must be the negative inverse, modulo 2^64, of the modulus's
 * least-significant word (see the assert below).  Each loop pass adds
 * a multiple of "mod" chosen to cancel the current least-significant
 * word of n, then steps n one word toward the most-significant end;
 * after mlen passes the low half is zero, and the high half holds
 * n / 2^(64*mlen) mod "mod" once the trailing subtractions bring it
 * below the modulus.
 */
void
lbnMontReduce_64(BNWORD64 *n, BNWORD64 const *mod, unsigned const mlen,
	BNWORD64 inv)
{
	BNWORD64 t;		/* Word carried out of each multiply-accumulate */
	BNWORD64 c = 0;		/* Carries propagated off the high half */
	unsigned len = mlen;	/* Low-order words left to cancel */

	/* inv must be the negative inverse of mod's least significant word */
	assert((BNWORD64)(inv * BIGLITTLE(mod[-1],mod[0])) == (BNWORD64)-1);

	assert(len);

	do {
		/*
		 * Multiplier inv * (low word of n) makes the low word of
		 * n + multiplier*mod come out to zero; t is the word that
		 * overflows the mlen-word multiply-accumulate and must be
		 * added into the (shrinking) high part.
		 */
		t = lbnMulAdd1_64(n, mod, mlen, inv * BIGLITTLE(n[-1],n[0]));
		c += lbnAdd1_64(BIGLITTLE(n-mlen,n+mlen), len, t);
		BIGLITTLE(--n,++n);	/* Advance past the now-zero word */
	} while (--len);

	/*
	 * All that adding can cause an overflow past the modulus size,
	 * but it's unusual, and never by much, so a subtraction loop
	 * is the right way to deal with it.
	 * This subtraction happens infrequently - I've only ever seen it
	 * invoked once per reduction, and then just under 22.5% of the time.
	 */
	while (c)
		c -= lbnSubN_64(n, mod, mlen);
	while (lbnCmp_64(n, mod, mlen) >= 0)
		(void)lbnSubN_64(n, mod, mlen);
}
#endif /* !lbnMontReduce_64 */
2033
2034 /*
2035 * A couple of helpers that you might want to implement atomically
2036 * in asm sometime.
2037 */
#ifndef lbnMontMul_64
/*
 * Multiply "num1" by "num2", modulo "mod", all of length "len", and
 * place the result in the high half of "prod".  "inv" is the inverse
 * of the least-significant word of the modulus, modulo 2^64.
 * This uses numbers in Montgomery form.  Reduce using "len" and "inv".
 *
 * "prod" must be 2*len words long: the full double-width product is
 * formed there by lbnMulX_64 and then Montgomery-reduced in place,
 * leaving the answer in the high len words.
 *
 * This is implemented as a macro to win on compilers that don't do
 * inlining, since it's so trivial.
 */
#define lbnMontMul_64(prod, n1, n2, mod, len, inv) \
	(lbnMulX_64(prod, n1, n2, len), lbnMontReduce_64(prod, mod, len, inv))
#endif /* !lbnMontMul_64 */
2051
#ifndef lbnMontSquare_64
/*
 * Square "n", modulo "mod", both of length "len", and place the result
 * in the high half of "prod".  "inv" is the inverse of the least-significant
 * word of the modulus, modulo 2^64.
 * This uses numbers in Montgomery form.  Reduce using "len" and "inv".
 *
 * "prod" must be 2*len words long, like lbnMontMul_64's; the squaring
 * fills all of it and the reduction leaves the answer in the high half.
 *
 * This is implemented as a macro to win on compilers that don't do
 * inlining, since it's so trivial.
 */
#define lbnMontSquare_64(prod, n, mod, len, inv) \
	(lbnSquare_64(prod, n, len), lbnMontReduce_64(prod, mod, len, inv))

#endif /* !lbnMontSquare_64 */
2066
/*
 * Convert a number to Montgomery form - requires mlen + nlen words
 * of memory in "n".
 *
 * Montgomery form of x is x * R mod "mod", with R = 2^(64*mlen).
 * Multiplying by R is just an mlen-word shift toward the significant
 * end (the copy/zero pair below); the "mod" is then an ordinary
 * division whose quotient is discarded into the high-order words.
 */
void
lbnToMont_64(BNWORD64 *n, unsigned nlen, BNWORD64 *mod, unsigned mlen)
{
	/* Move n up "mlen" words */
	lbnCopy_64(BIGLITTLE(n-mlen,n+mlen), n, nlen);
	lbnZero_64(n, mlen);
	/* Do the division - dump the quotient in the high-order words */
	(void)lbnDiv_64(BIGLITTLE(n-mlen,n+mlen), n, mlen+nlen, mod, mlen);
}
2080
/*
 * Convert from Montgomery form.  Montgomery reduction is all that is
 * needed: reducing x*R by a factor of R leaves plain x mod "mod".
 *
 * "n" must have 2*len words available; the high len words are used as
 * scratch, and the result ends up in the low len words.
 */
void
lbnFromMont_64(BNWORD64 *n, BNWORD64 *mod, unsigned len)
{
	/* Zero the high words of n */
	lbnZero_64(BIGLITTLE(n-len,n+len), len);
	/* Reduce; this leaves the answer in the high half of n */
	lbnMontReduce_64(n, mod, len, lbnMontInv1_64(mod[BIGLITTLE(-1,0)]));
	/* Move n down len words */
	lbnCopy_64(n, BIGLITTLE(n-len,n+len), len);
}
2094
2095 /*
2096 * The windowed exponentiation algorithm, precomputes a table of odd
2097 * powers of n up to 2^k. See the comment in bnExpMod_64 below for
 * an explanation of how it actually works.
2099 *
2100 * It takes 2^(k-1)-1 multiplies to compute the table, and (e-1)/(k+1)
2101 * multiplies (on average) to perform the exponentiation. To minimize
2102 * the sum, k must vary with e. The optimal window sizes vary with the
2103 * exponent length. Here are some selected values and the boundary cases.
2104 * (An underscore _ has been inserted into some of the numbers to ensure
2105 * that magic strings like 64 do not appear in this table. It should be
2106 * ignored.)
2107 *
2108 * At e = 1 bits, k=1 (0.000000) is best
2109 * At e = 2 bits, k=1 (0.500000) is best
2110 * At e = 4 bits, k=1 (1.500000) is best
2111 * At e = 8 bits, k=2 (3.333333) < k=1 (3.500000)
2112 * At e = 1_6 bits, k=2 (6.000000) is best
2113 * At e = 26 bits, k=3 (9.250000) < k=2 (9.333333)
2114 * At e = 3_2 bits, k=3 (10.750000) is best
2115 * At e = 6_4 bits, k=3 (18.750000) is best
2116 * At e = 82 bits, k=4 (23.200000) < k=3 (23.250000)
2117 * At e = 128 bits, k=4 (3_2.400000) is best
2118 * At e = 242 bits, k=5 (55.1_66667) < k=4 (55.200000)
2119 * At e = 256 bits, k=5 (57.500000) is best
2120 * At e = 512 bits, k=5 (100.1_66667) is best
2121 * At e = 674 bits, k=6 (127.142857) < k=5 (127.1_66667)
2122 * At e = 1024 bits, k=6 (177.142857) is best
2123 * At e = 1794 bits, k=7 (287.125000) < k=6 (287.142857)
2124 * At e = 2048 bits, k=7 (318.875000) is best
2125 * At e = 4096 bits, k=7 (574.875000) is best
2126 *
2127 * The numbers in parentheses are the expected number of multiplications
2128 * needed to do the computation. The normal russian-peasant modular
2129 * exponentiation technique always uses (e-1)/2. For exponents as
2130 * small as 192 bits (below the range of current factoring algorithms),
2131 * half of the multiplies are eliminated, 45.2 as opposed to the naive
2132 * 95.5. Counting the 191 squarings as 3/4 a multiply each (squaring
2133 * proper is just over half of multiplying, but the Montgomery
2134 * reduction in each case is also a multiply), that's 143.25
2135 * multiplies, for totals of 188.45 vs. 238.75 - a 21% savings.
2136 * For larger exponents (like 512 bits), it's 483.92 vs. 639.25, a
2137 * 24.3% savings. It asymptotically approaches 25%.
2138 *
2139 * Um, actually there's a slightly more accurate way to count, which
2140 * really is the average number of multiplies required, averaged
2141 * uniformly over all 2^(e-1) e-bit numbers, from 2^(e-1) to (2^e)-1.
2142 * It's based on the recurrence that for the last b bits, b <= k, at
2143 * most one multiply is needed (and none at all 1/2^b of the time),
2144 * while when b > k, the odds are 1/2 each way that the bit will be
2145 * 0 (meaning no multiplies to reduce it to the b-1-bit case) and
2146 * 1/2 that the bit will be 1, starting a k-bit window and requiring
2147 * 1 multiply beyond the b-k-bit case. Since the most significant
2148 * bit is always 1, a k-bit window always starts there, and that
2149 * multiply is by 1, so it isn't a multiply at all. Thus, the
2150 * number of multiplies is simply that needed for the last e-k bits.
2151 * This recurrence produces:
2152 *
2153 * At e = 1 bits, k=1 (0.000000) is best
2154 * At e = 2 bits, k=1 (0.500000) is best
2155 * At e = 4 bits, k=1 (1.500000) is best
2156 * At e = 6 bits, k=2 (2.437500) < k=1 (2.500000)
2157 * At e = 8 bits, k=2 (3.109375) is best
2158 * At e = 1_6 bits, k=2 (5.777771) is best
2159 * At e = 24 bits, k=3 (8.437629) < k=2 (8.444444)
2160 * At e = 3_2 bits, k=3 (10.437492) is best
2161 * At e = 6_4 bits, k=3 (18.437500) is best
2162 * At e = 81 bits, k=4 (22.6_40000) < k=3 (22.687500)
2163 * At e = 128 bits, k=4 (3_2.040000) is best
2164 * At e = 241 bits, k=5 (54.611111) < k=4 (54.6_40000)
2165 * At e = 256 bits, k=5 (57.111111) is best
2166 * At e = 512 bits, k=5 (99.777778) is best
2167 * At e = 673 bits, k=6 (126.591837) < k=5 (126.611111)
2168 * At e = 1024 bits, k=6 (176.734694) is best
2169 * At e = 1793 bits, k=7 (286.578125) < k=6 (286.591837)
2170 * At e = 2048 bits, k=7 (318.453125) is best
2171 * At e = 4096 bits, k=7 (574.453125) is best
2172 *
2173 * This has the rollover points at 6, 24, 81, 241, 673 and 1793 instead
2174 * of 8, 26, 82, 242, 674, and 1794. Not a very big difference.
2175 * (The numbers past that are k=8 at 4609 and k=9 at 11521,
2176 * vs. one more in each case for the approximation.)
2177 *
2178 * Given that exponents for which k>7 are useful are uncommon,
2179 * a fixed size table for k <= 7 is used for simplicity.
2180 *
2181 * The basic number of squarings needed is e-1, although a k-bit
2182 * window (for k > 1) can save, on average, k-2 of those, too.
2183 * That savings currently isn't counted here. It would drive the
2184 * crossover points slightly lower.
2185 * (Actually, this win is also reduced in the DoubleExpMod case,
2186 * meaning we'd have to split the tables. Except for that, the
2187 * multiplies by powers of the two bases are independent, so
2188 * the same logic applies to each as the single case.)
2189 *
2190 * Table entry i is the largest number of bits in an exponent to
2191 * process with a window size of i+1. Entry 6 is the largest
2192 * possible unsigned number, so the window will never be more
2193 * than 7 bits, requiring 2^6 = 0x40 slots.
2194 */
#define BNEXPMOD_MAX_WINDOW	7
/*
 * bnExpModThreshTable[i] is the largest exponent bit-length for which a
 * window of i+1 bits should be used (per the analysis above).  The final
 * entry is a sentinel, so the lookup loops always stop with a window of
 * at most BNEXPMOD_MAX_WINDOW bits.
 */
static unsigned const bnExpModThreshTable[BNEXPMOD_MAX_WINDOW] = {
	5, 23, 80, 240, 672, 1792, (unsigned)-1
/*	7, 25, 81, 241, 673, 1793, (unsigned)-1 ### The old approximations */
};
2200
2201 /*
2202 * Perform modular exponentiation, as fast as possible! This uses
2203 * Montgomery reduction, optimized squaring, and windowed exponentiation.
2204 * The modulus "mod" MUST be odd!
2205 *
2206 * This returns 0 on success, -1 on out of memory.
2207 *
2208 * The window algorithm:
2209 * The idea is to keep a running product of b1 = n^(high-order bits of exp),
2210 * and then keep appending exponent bits to it. The following patterns
2211 * apply to a 3-bit window (k = 3):
2212 * To append 0: square
2213 * To append 1: square, multiply by n^1
2214 * To append 10: square, multiply by n^1, square
2215 * To append 11: square, square, multiply by n^3
2216 * To append 100: square, multiply by n^1, square, square
2217 * To append 101: square, square, square, multiply by n^5
2218 * To append 110: square, square, multiply by n^3, square
2219 * To append 111: square, square, square, multiply by n^7
2220 *
2221 * Since each pattern involves only one multiply, the longer the pattern
2222 * the better, except that a 0 (no multiplies) can be appended directly.
2223 * We precompute a table of odd powers of n, up to 2^k, and can then
2224 * multiply k bits of exponent at a time. Actually, assuming random
2225 * exponents, there is on average one zero bit between needs to
2226 * multiply (1/2 of the time there's none, 1/4 of the time there's 1,
 * 1/8 of the time, there's 2, 1/16 of the time, there's 3, etc.), so
2228 * you have to do one multiply per k+1 bits of exponent.
2229 *
2230 * The loop walks down the exponent, squaring the result buffer as
2231 * it goes. There is a wbits+1 bit lookahead buffer, buf, that is
2232 * filled with the upcoming exponent bits. (What is read after the
2233 * end of the exponent is unimportant, but it is filled with zero here.)
2234 * When the most-significant bit of this buffer becomes set, i.e.
2235 * (buf & tblmask) != 0, we have to decide what pattern to multiply
2236 * by, and when to do it. We decide, remember to do it in future
2237 * after a suitable number of squarings have passed (e.g. a pattern
2238 * of "100" in the buffer requires that we multiply by n^1 immediately;
2239 * a pattern of "110" calls for multiplying by n^3 after one more
2240 * squaring), clear the buffer, and continue.
2241 *
2242 * When we start, there is one more optimization: the result buffer
 * is implicitly one, so squaring it or multiplying by it can be
2244 * optimized away. Further, if we start with a pattern like "100"
2245 * in the lookahead window, rather than placing n into the buffer
2246 * and then starting to square it, we have already computed n^2
2247 * to compute the odd-powers table, so we can place that into
2248 * the buffer and save a squaring.
2249 *
2250 * This means that if you have a k-bit window, to compute n^z,
2251 * where z is the high k bits of the exponent, 1/2 of the time
2252 * it requires no squarings. 1/4 of the time, it requires 1
 * squaring, ... 1/2^(k-1) of the time, it requires k-2 squarings.
2254 * And the remaining 1/2^(k-1) of the time, the top k bits are a
2255 * 1 followed by k-1 0 bits, so it again only requires k-2
2256 * squarings, not k-1. The average of these is 1. Add that
2257 * to the one squaring we have to do to compute the table,
2258 * and you'll see that a k-bit window saves k-2 squarings
2259 * as well as reducing the multiplies. (It actually doesn't
2260 * hurt in the case k = 1, either.)
2261 *
2262 * n must have mlen words allocated. Although fewer may be in use
2263 * when n is passed in, all are in use on exit.
2264 */
2265 int
2266 lbnExpMod_64(BNWORD64 *result, BNWORD64 const *n, unsigned nlen,
2267 BNWORD64 const *e, unsigned elen, BNWORD64 *mod, unsigned mlen)
2268 {
2269 BNWORD64 *table[1 << (BNEXPMOD_MAX_WINDOW-1)];
2270 /* Table of odd powers of n */
2271 unsigned ebits; /* Exponent bits */
2272 unsigned wbits; /* Window size */
2273 unsigned tblmask; /* Mask of exponentiation window */
2274 BNWORD64 bitpos; /* Mask of current look-ahead bit */
2275 unsigned buf; /* Buffer of exponent bits */
2276 unsigned multpos; /* Where to do pending multiply */
2277 BNWORD64 const *mult; /* What to multiply by */
2278 unsigned i; /* Loop counter */
2279 int isone; /* Flag: accum. is implicitly one */
2280 BNWORD64 *a, *b; /* Working buffers/accumulators */
2281 BNWORD64 *t; /* Pointer into the working buffers */
2282 BNWORD64 inv; /* mod^-1 modulo 2^64 */
2283 int y; /* bnYield() result */
2284
2285 assert(mlen);
2286 assert(nlen <= mlen);
2287
2288 /* First, a couple of trivial cases. */
2289 elen = lbnNorm_64(e, elen);
2290 if (!elen) {
2291 /* x ^ 0 == 1 */
2292 lbnZero_64(result, mlen);
2293 BIGLITTLE(result[-1],result[0]) = 1;
2294 return 0;
2295 }
2296 ebits = lbnBits_64(e, elen);
2297 if (ebits == 1) {
2298 /* x ^ 1 == x */
2299 if (n != result)
2300 lbnCopy_64(result, n, nlen);
2301 if (mlen > nlen)
2302 lbnZero_64(BIGLITTLE(result-nlen,result+nlen),
2303 mlen-nlen);
2304 return 0;
2305 }
2306
2307 /* Okay, now move the exponent pointer to the most-significant word */
2308 e = BIGLITTLE(e-elen, e+elen-1);
2309
2310 /* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
2311 wbits = 0;
2312 while (ebits > bnExpModThreshTable[wbits])
2313 wbits++;
2314
2315 /* Allocate working storage: two product buffers and the tables. */
2316 LBNALLOC(a, BNWORD64, 2*mlen);
2317 if (!a)
2318 return -1;
2319 LBNALLOC(b, BNWORD64, 2*mlen);
2320 if (!b) {
2321 LBNFREE(a, 2*mlen);
2322 return -1;
2323 }
2324
2325 /* Convert to the appropriate table size: tblmask = 1<<(k-1) */
2326 tblmask = 1u << wbits;
2327
2328 /* We have the result buffer available, so use it. */
2329 table[0] = result;
2330
2331 /*
2332 * Okay, we now have a minimal-sized table - expand it.
2333 * This is allowed to fail! If so, scale back the table size
2334 * and proceed.
2335 */
2336 for (i = 1; i < tblmask; i++) {
2337 LBNALLOC(t, BNWORD64, mlen);
2338 if (!t) /* Out of memory! Quit the loop. */
2339 break;
2340 table[i] = t;
2341 }
2342
2343 /* If we stopped, with i < tblmask, shrink the tables appropriately */
2344 while (tblmask > i) {
2345 wbits--;
2346 tblmask >>= 1;
2347 }
2348 /* Free up our overallocations */
2349 while (--i > tblmask)
2350 LBNFREE(table[i], mlen);
2351
2352 /* Okay, fill in the table */
2353
2354 /* Compute the necessary modular inverse */
2355 inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
2356
2357 /* Convert n to Montgomery form */
2358
2359 /* Move n up "mlen" words into a */
2360 t = BIGLITTLE(a-mlen, a+mlen);
2361 lbnCopy_64(t, n, nlen);
2362 lbnZero_64(a, mlen);
2363 /* Do the division - lose the quotient into the high-order words */
2364 (void)lbnDiv_64(t, a, mlen+nlen, mod, mlen);
2365 /* Copy into first table entry */
2366 lbnCopy_64(table[0], a, mlen);
2367
2368 /* Square a into b */
2369 lbnMontSquare_64(b, a, mod, mlen, inv);
2370
2371 /* Use high half of b to initialize the table */
2372 t = BIGLITTLE(b-mlen, b+mlen);
2373 for (i = 1; i < tblmask; i++) {
2374 lbnMontMul_64(a, t, table[i-1], mod, mlen, inv);
2375 lbnCopy_64(table[i], BIGLITTLE(a-mlen, a+mlen), mlen);
2376 #if BNYIELD
2377 if (bnYield && (y = bnYield()) < 0)
2378 goto yield;
2379 #endif
2380 }
2381
2382 /* We might use b = n^2 later... */
2383
2384 /* Initialze the fetch pointer */
2385 bitpos = (BNWORD64)1 << ((ebits-1) & (64-1)); /* Initialize mask */
2386
2387 /* This should point to the msbit of e */
2388 assert((*e & bitpos) != 0);
2389
2390 /*
2391 * Pre-load the window. Becuase the window size is
2392 * never larger than the exponent size, there is no need to
2393 * detect running off the end of e in here.
2394 *
2395 * The read-ahead is controlled by elen and the bitpos mask.
2396 * Note that this is *ahead* of ebits, which tracks the
2397 * most significant end of the window. The purpose of this
2398 * initialization is to get the two wbits+1 bits apart,
2399 * like they should be.
2400 *
2401 * Note that bitpos and e1len together keep track of the
2402 * lookahead read pointer in the exponent that is used here.
2403 */
2404 buf = 0;
2405 for (i = 0; i <= wbits; i++) {
2406 buf = (buf << 1) | ((*e & bitpos) != 0);
2407 bitpos >>= 1;
2408 if (!bitpos) {
2409 BIGLITTLE(e++,e--);
2410 bitpos = (BNWORD64)1 << (64-1);
2411 elen--;
2412 }
2413 }
2414 assert(buf & tblmask);
2415
2416 /*
2417 * Set the pending multiply positions to a location that will
2418 * never be encountered, thus ensuring that nothing will happen
2419 * until the need for a multiply appears and one is scheduled.
2420 */
2421 multpos = ebits; /* A NULL value */
2422 mult = 0; /* Force a crash if we use these */
2423
2424 /*
2425 * Okay, now begins the real work. The first step is
2426 * slightly magic, so it's done outside the main loop,
2427 * but it's very similar to what's inside.
2428 */
2429 ebits--; /* Start processing the first bit... */
2430 isone = 1;
2431
2432 /*
2433 * This is just like the multiply in the loop, except that
2434 * - We know the msbit of buf is set, and
2435 * - We have the extra value n^2 floating around.
2436 * So, do the usual computation, and if the result is that
2437 * the buffer should be multiplied by n^1 immediately
2438 * (which we'd normally then square), we multiply it
2439 * (which reduces to a copy, which reduces to setting a flag)
2440 * by n^2 and skip the squaring. Thus, we do the
2441 * multiply and the squaring in one step.
2442 */
2443 assert(buf & tblmask);
2444 multpos = ebits - wbits;
2445 while ((buf & 1) == 0) {
2446 buf >>= 1;
2447 multpos++;
2448 }
2449 /* Intermediates can wrap, but final must NOT */
2450 assert(multpos <= ebits);
2451 mult = table[buf>>1];
2452 buf = 0;
2453
2454 /* Special case: use already-computed value sitting in buffer */
2455 if (multpos == ebits)
2456 isone = 0;
2457
2458 /*
2459 * At this point, the buffer (which is the high half of b) holds
2460 * either 1 (implicitly, as the "isone" flag is set), or n^2.
2461 */
2462
2463 /*
2464 * The main loop. The procedure is:
2465 * - Advance the window
2466 * - If the most-significant bit of the window is set,
2467 * schedule a multiply for the appropriate time in the
2468 * future (may be immediately)
2469 * - Perform any pending multiples
2470 * - Check for termination
2471 * - Square the buffer
2472 *
2473 * At any given time, the acumulated product is held in
2474 * the high half of b.
2475 */
2476 for (;;) {
2477 ebits--;
2478
2479 /* Advance the window */
2480 assert(buf < tblmask);
2481 buf <<= 1;
2482 /*
2483 * This reads ahead of the current exponent position
2484 * (controlled by ebits), so we have to be able to read
2485 * past the lsb of the exponents without error.
2486 */
2487 if (elen) {
2488 buf |= ((*e & bitpos) != 0);
2489 bitpos >>= 1;
2490 if (!bitpos) {
2491 BIGLITTLE(e++,e--);
2492 bitpos = (BNWORD64)1 << (64-1);
2493 elen--;
2494 }
2495 }
2496
2497 /* Examine the window for pending multiplies */
2498 if (buf & tblmask) {
2499 multpos = ebits - wbits;
2500 while ((buf & 1) == 0) {
2501 buf >>= 1;
2502 multpos++;
2503 }
2504 /* Intermediates can wrap, but final must NOT */
2505 assert(multpos <= ebits);
2506 mult = table[buf>>1];
2507 buf = 0;
2508 }
2509
2510 /* If we have a pending multiply, do it */
2511 if (ebits == multpos) {
2512 /* Multiply by the table entry remembered previously */
2513 t = BIGLITTLE(b-mlen, b+mlen);
2514 if (isone) {
2515 /* Multiply by 1 is a trivial case */
2516 lbnCopy_64(t, mult, mlen);
2517 isone = 0;
2518 } else {
2519 lbnMontMul_64(a, t, mult, mod, mlen, inv);
2520 /* Swap a and b */
2521 t = a; a = b; b = t;
2522 }
2523 }
2524
2525 /* Are we done? */
2526 if (!ebits)
2527 break;
2528
2529 /* Square the input */
2530 if (!isone) {
2531 t = BIGLITTLE(b-mlen, b+mlen);
2532 lbnMontSquare_64(a, t, mod, mlen, inv);
2533 /* Swap a and b */
2534 t = a; a = b; b = t;
2535 }
2536 #if BNYIELD
2537 if (bnYield && (y = bnYield()) < 0)
2538 goto yield;
2539 #endif
2540 } /* for (;;) */
2541
2542 assert(!isone);
2543 assert(!buf);
2544
2545 /* DONE! */
2546
2547 /* Convert result out of Montgomery form */
2548 t = BIGLITTLE(b-mlen, b+mlen);
2549 lbnCopy_64(b, t, mlen);
2550 lbnZero_64(t, mlen);
2551 lbnMontReduce_64(b, mod, mlen, inv);
2552 lbnCopy_64(result, t, mlen);
2553 /*
2554 * Clean up - free intermediate storage.
2555 * Do NOT free table[0], which is the result
2556 * buffer.
2557 */
2558 y = 0;
2559 #if BNYIELD
2560 yield:
2561 #endif
2562 while (--tblmask)
2563 LBNFREE(table[tblmask], mlen);
2564 LBNFREE(b, 2*mlen);
2565 LBNFREE(a, 2*mlen);
2566
2567 return y; /* Success */
2568 }
2569
2570 /*
2571 * Compute and return n1^e1 * n2^e2 mod "mod".
2572 * result may be either input buffer, or something separate.
2573 * It must be "mlen" words long.
2574 *
2575 * There is a current position in the exponents, which is kept in e1bits.
2576 * (The exponents are swapped if necessary so e1 is the longer of the two.)
2577 * At any given time, the value in the accumulator is
2578 * n1^(e1>>e1bits) * n2^(e2>>e1bits) mod "mod".
2579 * As e1bits is counted down, this is updated, by squaring it and doing
2580 * any necessary multiplies.
2581 * To decide on the necessary multiplies, two windows, each w1bits+1 bits
2582 * wide, are maintained in buf1 and buf2, which read *ahead* of the
2583 * e1bits position (with appropriate handling of the case when e1bits
2584 * drops below w1bits+1). When the most-significant bit of either window
2585 * becomes set, indicating that something needs to be multiplied by
2586 * the accumulator or it will get out of sync, the window is examined
2587 * to see which power of n1 or n2 to multiply by, and when (possibly
2588 * later, if the power is greater than 1) the multiply should take
2589 * place. Then the multiply and its location are remembered and the
2590 * window is cleared.
2591 *
2592 * If we had every power of n1 in the table, the multiply would always
2593 * be w1bits steps in the future. But we only keep the odd powers,
2594 * so instead of waiting w1bits squarings and then multiplying
2595 * by n1^k, we wait w1bits-k squarings and multiply by n1.
2596 *
2597 * Actually, w2bits can be less than w1bits, but the window is the same
2598 * size, to make it easier to keep track of where we're reading. The
2599 * appropriate number of low-order bits of the window are just ignored.
2600 */
2601 int
2602 lbnDoubleExpMod_64(BNWORD64 *result,
2603 BNWORD64 const *n1, unsigned n1len,
2604 BNWORD64 const *e1, unsigned e1len,
2605 BNWORD64 const *n2, unsigned n2len,
2606 BNWORD64 const *e2, unsigned e2len,
2607 BNWORD64 *mod, unsigned mlen)
2608 {
2609 BNWORD64 *table1[1 << (BNEXPMOD_MAX_WINDOW-1)];
2610 /* Table of odd powers of n1 */
2611 BNWORD64 *table2[1 << (BNEXPMOD_MAX_WINDOW-1)];
2612 /* Table of odd powers of n2 */
2613 unsigned e1bits, e2bits; /* Exponent bits */
2614 unsigned w1bits, w2bits; /* Window sizes */
2615 unsigned tblmask; /* Mask of exponentiation window */
2616 BNWORD64 bitpos; /* Mask of current look-ahead bit */
2617 unsigned buf1, buf2; /* Buffer of exponent bits */
2618 unsigned mult1pos, mult2pos; /* Where to do pending multiply */
2619 BNWORD64 const *mult1, *mult2; /* What to multiply by */
2620 unsigned i; /* Loop counter */
2621 int isone; /* Flag: accum. is implicitly one */
2622 BNWORD64 *a, *b; /* Working buffers/accumulators */
2623 BNWORD64 *t; /* Pointer into the working buffers */
2624 BNWORD64 inv; /* mod^-1 modulo 2^64 */
2625 int y; /* bnYield() result */
2626
2627 assert(mlen);
2628 assert(n1len <= mlen);
2629 assert(n2len <= mlen);
2630
2631 /* First, a couple of trivial cases. */
2632 e1len = lbnNorm_64(e1, e1len);
2633 e2len = lbnNorm_64(e2, e2len);
2634
2635 /* Ensure that the first exponent is the longer */
2636 e1bits = lbnBits_64(e1, e1len);
2637 e2bits = lbnBits_64(e2, e2len);
2638 if (e1bits < e2bits) {
2639 i = e1len; e1len = e2len; e2len = i;
2640 i = e1bits; e1bits = e2bits; e2bits = i;
2641 t = (BNWORD64 *)n1; n1 = n2; n2 = t;
2642 t = (BNWORD64 *)e1; e1 = e2; e2 = t;
2643 }
2644 assert(e1bits >= e2bits);
2645
2646 /* Handle a trivial case */
2647 if (!e2len)
2648 return lbnExpMod_64(result, n1, n1len, e1, e1len, mod, mlen);
2649 assert(e2bits);
2650
	/* The code below misbehaves if the exponents aren't at least 2 bits */
2652 if (e1bits == 1) {
2653 assert(e2bits == 1);
2654
2655 LBNALLOC(a, BNWORD64, n1len+n2len);
2656 if (!a)
2657 return -1;
2658
2659 lbnMul_64(a, n1, n1len, n2, n2len);
2660 /* Do a direct modular reduction */
2661 if (n1len + n2len >= mlen)
2662 (void)lbnDiv_64(a+mlen, a, n1len+n2len, mod, mlen);
2663 lbnCopy_64(result, a, mlen);
2664 LBNFREE(a, n1len+n2len);
2665 return 0;
2666 }
2667
2668 /* Okay, now move the exponent pointers to the most-significant word */
2669 e1 = BIGLITTLE(e1-e1len, e1+e1len-1);
2670 e2 = BIGLITTLE(e2-e2len, e2+e2len-1);
2671
2672 /* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
2673 w1bits = 0;
2674 while (e1bits > bnExpModThreshTable[w1bits])
2675 w1bits++;
2676 w2bits = 0;
2677 while (e2bits > bnExpModThreshTable[w2bits])
2678 w2bits++;
2679
2680 assert(w1bits >= w2bits);
2681
2682 /* Allocate working storage: two product buffers and the tables. */
2683 LBNALLOC(a, BNWORD64, 2*mlen);
2684 if (!a)
2685 return -1;
2686 LBNALLOC(b, BNWORD64, 2*mlen);
2687 if (!b) {
2688 LBNFREE(a, 2*mlen);
2689 return -1;
2690 }
2691
2692 /* Convert to the appropriate table size: tblmask = 1<<(k-1) */
2693 tblmask = 1u << w1bits;
2694 /* Use buf2 for its size, temporarily */
2695 buf2 = 1u << w2bits;
2696
2697 LBNALLOC(t, BNWORD64, mlen);
2698 if (!t) {
2699 LBNFREE(b, 2*mlen);
2700 LBNFREE(a, 2*mlen);
2701 return -1;
2702 }
2703 table1[0] = t;
2704 table2[0] = result;
2705
2706 /*
2707 * Okay, we now have some minimal-sized tables - expand them.
2708 * This is allowed to fail! If so, scale back the table sizes
2709 * and proceed. We allocate both tables at the same time
2710 * so if it fails partway through, they'll both be a reasonable
2711 * size rather than one huge and one tiny.
2712 * When i passes buf2 (the number of entries in the e2 window,
2713 * which may be less than the number of entries in the e1 window),
2714 * stop allocating e2 space.
2715 */
2716 for (i = 1; i < tblmask; i++) {
2717 LBNALLOC(t, BNWORD64, mlen);
2718 if (!t) /* Out of memory! Quit the loop. */
2719 break;
2720 table1[i] = t;
2721 if (i < buf2) {
2722 LBNALLOC(t, BNWORD64, mlen);
2723 if (!t) {
2724 LBNFREE(table1[i], mlen);
2725 break;
2726 }
2727 table2[i] = t;
2728 }
2729 }
2730
2731 /* If we stopped, with i < tblmask, shrink the tables appropriately */
2732 while (tblmask > i) {
2733 w1bits--;
2734 tblmask >>= 1;
2735 }
2736 /* Free up our overallocations */
2737 while (--i > tblmask) {
2738 if (i < buf2)
2739 LBNFREE(table2[i], mlen);
2740 LBNFREE(table1[i], mlen);
2741 }
2742 /* And shrink the second window too, if needed */
2743 if (w2bits > w1bits) {
2744 w2bits = w1bits;
2745 buf2 = tblmask;
2746 }
2747
2748 /*
2749 * From now on, use the w2bits variable for the difference
2750 * between w1bits and w2bits.
2751 */
2752 w2bits = w1bits-w2bits;
2753
2754 /* Okay, fill in the tables */
2755
2756 /* Compute the necessary modular inverse */
2757 inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
2758
2759 /* Convert n1 to Montgomery form */
2760
2761 /* Move n1 up "mlen" words into a */
2762 t = BIGLITTLE(a-mlen, a+mlen);
2763 lbnCopy_64(t, n1, n1len);
2764 lbnZero_64(a, mlen);
2765 /* Do the division - lose the quotient into the high-order words */
2766 (void)lbnDiv_64(t, a, mlen+n1len, mod, mlen);
2767 /* Copy into first table entry */
2768 lbnCopy_64(table1[0], a, mlen);
2769
2770 /* Square a into b */
2771 lbnMontSquare_64(b, a, mod, mlen, inv);
2772
2773 /* Use high half of b to initialize the first table */
2774 t = BIGLITTLE(b-mlen, b+mlen);
2775 for (i = 1; i < tblmask; i++) {
2776 lbnMontMul_64(a, t, table1[i-1], mod, mlen, inv);
2777 lbnCopy_64(table1[i], BIGLITTLE(a-mlen, a+mlen), mlen);
2778 #if BNYIELD
2779 if (bnYield && (y = bnYield()) < 0)
2780 goto yield;
2781 #endif
2782 }
2783
2784 /* Convert n2 to Montgomery form */
2785
2786 t = BIGLITTLE(a-mlen, a+mlen);
2787 /* Move n2 up "mlen" words into a */
2788 lbnCopy_64(t, n2, n2len);
2789 lbnZero_64(a, mlen);
2790 /* Do the division - lose the quotient into the high-order words */
2791 (void)lbnDiv_64(t, a, mlen+n2len, mod, mlen);
2792 /* Copy into first table entry */
2793 lbnCopy_64(table2[0], a, mlen);
2794
2795 /* Square it into a */
2796 lbnMontSquare_64(a, table2[0], mod, mlen, inv);
2797 /* Copy to b, low half */
2798 lbnCopy_64(b, t, mlen);
2799
2800 /* Use b to initialize the second table */
2801 for (i = 1; i < buf2; i++) {
2802 lbnMontMul_64(a, b, table2[i-1], mod, mlen, inv);
2803 lbnCopy_64(table2[i], t, mlen);
2804 #if BNYIELD
2805 if (bnYield && (y = bnYield()) < 0)
2806 goto yield;
2807 #endif
2808 }
2809
2810 /*
2811 * Okay, a recap: at this point, the low part of b holds
2812 * n2^2, the high part holds n1^2, and the tables are
2813 * initialized with the odd powers of n1 and n2 from 1
2814 * through 2*tblmask-1 and 2*buf2-1.
2815 *
2816 * We might use those squares in b later, or we might not.
2817 */
2818
	/* Initialize the fetch pointer */
2820 bitpos = (BNWORD64)1 << ((e1bits-1) & (64-1)); /* Initialize mask */
2821
2822 /* This should point to the msbit of e1 */
2823 assert((*e1 & bitpos) != 0);
2824
2825 /*
	 * Pre-load the windows.  Because the window size is
2827 * never larger than the exponent size, there is no need to
2828 * detect running off the end of e1 in here.
2829 *
2830 * The read-ahead is controlled by e1len and the bitpos mask.
2831 * Note that this is *ahead* of e1bits, which tracks the
2832 * most significant end of the window. The purpose of this
2833 * initialization is to get the two w1bits+1 bits apart,
2834 * like they should be.
2835 *
2836 * Note that bitpos and e1len together keep track of the
2837 * lookahead read pointer in the exponent that is used here.
2838 * e2len is not decremented, it is only ever compared with
2839 * e1len as *that* is decremented.
2840 */
2841 buf1 = buf2 = 0;
2842 for (i = 0; i <= w1bits; i++) {
2843 buf1 = (buf1 << 1) | ((*e1 & bitpos) != 0);
2844 if (e1len <= e2len)
2845 buf2 = (buf2 << 1) | ((*e2 & bitpos) != 0);
2846 bitpos >>= 1;
2847 if (!bitpos) {
2848 BIGLITTLE(e1++,e1--);
2849 if (e1len <= e2len)
2850 BIGLITTLE(e2++,e2--);
2851 bitpos = (BNWORD64)1 << (64-1);
2852 e1len--;
2853 }
2854 }
2855 assert(buf1 & tblmask);
2856
2857 /*
2858 * Set the pending multiply positions to a location that will
2859 * never be encountered, thus ensuring that nothing will happen
2860 * until the need for a multiply appears and one is scheduled.
2861 */
2862 mult1pos = mult2pos = e1bits; /* A NULL value */
2863 mult1 = mult2 = 0; /* Force a crash if we use these */
2864
2865 /*
2866 * Okay, now begins the real work. The first step is
2867 * slightly magic, so it's done outside the main loop,
2868 * but it's very similar to what's inside.
2869 */
2870 isone = 1; /* Buffer is implicitly 1, so replace * by copy */
2871 e1bits--; /* Start processing the first bit... */
2872
2873 /*
2874 * This is just like the multiply in the loop, except that
2875 * - We know the msbit of buf1 is set, and
2876 * - We have the extra value n1^2 floating around.
2877 * So, do the usual computation, and if the result is that
2878 * the buffer should be multiplied by n1^1 immediately
2879 * (which we'd normally then square), we multiply it
2880 * (which reduces to a copy, which reduces to setting a flag)
2881 * by n1^2 and skip the squaring. Thus, we do the
2882 * multiply and the squaring in one step.
2883 */
2884 assert(buf1 & tblmask);
2885 mult1pos = e1bits - w1bits;
2886 while ((buf1 & 1) == 0) {
2887 buf1 >>= 1;
2888 mult1pos++;
2889 }
2890 /* Intermediates can wrap, but final must NOT */
2891 assert(mult1pos <= e1bits);
2892 mult1 = table1[buf1>>1];
2893 buf1 = 0;
2894
2895 /* Special case: use already-computed value sitting in buffer */
2896 if (mult1pos == e1bits)
2897 isone = 0;
2898
2899 /*
2900 * The first multiply by a power of n2. Similar, but
2901 * we might not even want to schedule a multiply if e2 is
2902 * shorter than e1, and the window might be shorter so
2903 * we have to leave the low w2bits bits alone.
2904 */
2905 if (buf2 & tblmask) {
2906 /* Remember low-order bits for later */
2907 i = buf2 & ((1u << w2bits) - 1);
2908 buf2 >>= w2bits;
2909 mult2pos = e1bits - w1bits + w2bits;
2910 while ((buf2 & 1) == 0) {
2911 buf2 >>= 1;
2912 mult2pos++;
2913 }
2914 assert(mult2pos <= e1bits);
2915 mult2 = table2[buf2>>1];
2916 buf2 = i;
2917
2918 if (mult2pos == e1bits) {
2919 t = BIGLITTLE(b-mlen, b+mlen);
2920 if (isone) {
2921 lbnCopy_64(t, b, mlen); /* Copy low to high */
2922 isone = 0;
2923 } else {
2924 lbnMontMul_64(a, t, b, mod, mlen, inv);
2925 t = a; a = b; b = t;
2926 }
2927 }
2928 }
2929
2930 /*
2931 * At this point, the buffer (which is the high half of b)
2932 * holds either 1 (implicitly, as the "isone" flag is set),
2933 * n1^2, n2^2 or n1^2 * n2^2.
2934 */
2935
2936 /*
2937 * The main loop. The procedure is:
2938 * - Advance the windows
2939 * - If the most-significant bit of a window is set,
2940 * schedule a multiply for the appropriate time in the
2941 * future (may be immediately)
2942 * - Perform any pending multiples
2943 * - Check for termination
2944 * - Square the buffers
2945 *
2946 * At any given time, the acumulated product is held in
2947 * the high half of b.
2948 */
2949 for (;;) {
2950 e1bits--;
2951
2952 /* Advance the windows */
2953 assert(buf1 < tblmask);
2954 buf1 <<= 1;
2955 assert(buf2 < tblmask);
2956 buf2 <<= 1;
2957 /*
2958 * This reads ahead of the current exponent position
2959 * (controlled by e1bits), so we have to be able to read
2960 * past the lsb of the exponents without error.
2961 */
2962 if (e1len) {
2963 buf1 |= ((*e1 & bitpos) != 0);
2964 if (e1len <= e2len)
2965 buf2 |= ((*e2 & bitpos) != 0);
2966 bitpos >>= 1;
2967 if (!bitpos) {
2968 BIGLITTLE(e1++,e1--);
2969 if (e1len <= e2len)
2970 BIGLITTLE(e2++,e2--);
2971 bitpos = (BNWORD64)1 << (64-1);
2972 e1len--;
2973 }
2974 }
2975
2976 /* Examine the first window for pending multiplies */
2977 if (buf1 & tblmask) {
2978 mult1pos = e1bits - w1bits;
2979 while ((buf1 & 1) == 0) {
2980 buf1 >>= 1;
2981 mult1pos++;
2982 }
2983 /* Intermediates can wrap, but final must NOT */
2984 assert(mult1pos <= e1bits);
2985 mult1 = table1[buf1>>1];
2986 buf1 = 0;
2987 }
2988
2989 /*
2990 * Examine the second window for pending multiplies.
2991 * Window 2 can be smaller than window 1, but we
2992 * keep the same number of bits in buf2, so we need
2993 * to ignore any low-order bits in the buffer when
2994 * computing what to multiply by, and recompute them
2995 * later.
2996 */
2997 if (buf2 & tblmask) {
2998 /* Remember low-order bits for later */
2999 i = buf2 & ((1u << w2bits) - 1);
3000 buf2 >>= w2bits;
3001 mult2pos = e1bits - w1bits + w2bits;
3002 while ((buf2 & 1) == 0) {
3003 buf2 >>= 1;
3004 mult2pos++;
3005 }
3006 assert(mult2pos <= e1bits);
3007 mult2 = table2[buf2>>1];
3008 buf2 = i;
3009 }
3010
3011
3012 /* If we have a pending multiply for e1, do it */
3013 if (e1bits == mult1pos) {
3014 /* Multiply by the table entry remembered previously */
3015 t = BIGLITTLE(b-mlen, b+mlen);
3016 if (isone) {
3017 /* Multiply by 1 is a trivial case */
3018 lbnCopy_64(t, mult1, mlen);
3019 isone = 0;
3020 } else {
3021 lbnMontMul_64(a, t, mult1, mod, mlen, inv);
3022 /* Swap a and b */
3023 t = a; a = b; b = t;
3024 }
3025 }
3026
3027 /* If we have a pending multiply for e2, do it */
3028 if (e1bits == mult2pos) {
3029 /* Multiply by the table entry remembered previously */
3030 t = BIGLITTLE(b-mlen, b+mlen);
3031 if (isone) {
3032 /* Multiply by 1 is a trivial case */
3033 lbnCopy_64(t, mult2, mlen);
3034 isone = 0;
3035 } else {
3036 lbnMontMul_64(a, t, mult2, mod, mlen, inv);
3037 /* Swap a and b */
3038 t = a; a = b; b = t;
3039 }
3040 }
3041
3042 /* Are we done? */
3043 if (!e1bits)
3044 break;
3045
3046 /* Square the buffer */
3047 if (!isone) {
3048 t = BIGLITTLE(b-mlen, b+mlen);
3049 lbnMontSquare_64(a, t, mod, mlen, inv);
3050 /* Swap a and b */
3051 t = a; a = b; b = t;
3052 }
3053 #if BNYIELD
3054 if (bnYield && (y = bnYield()) < 0)
3055 goto yield;
3056 #endif
3057 } /* for (;;) */
3058
3059 assert(!isone);
3060 assert(!buf1);
3061 assert(!buf2);
3062
3063 /* DONE! */
3064
3065 /* Convert result out of Montgomery form */
3066 t = BIGLITTLE(b-mlen, b+mlen);
3067 lbnCopy_64(b, t, mlen);
3068 lbnZero_64(t, mlen);
3069 lbnMontReduce_64(b, mod, mlen, inv);
3070 lbnCopy_64(result, t, mlen);
3071
3072 /* Clean up - free intermediate storage */
3073 y = 0;
3074 #if BNYIELD
3075 yield:
3076 #endif
3077 buf2 = tblmask >> w2bits;
3078 while (--tblmask) {
3079 if (tblmask < buf2)
3080 LBNFREE(table2[tblmask], mlen);
3081 LBNFREE(table1[tblmask], mlen);
3082 }
3083 t = table1[0];
3084 LBNFREE(t, mlen);
3085 LBNFREE(b, 2*mlen);
3086 LBNFREE(a, 2*mlen);
3087
3088 return y; /* Success */
3089 }
3090
/*
 * 2^exp (mod mod).  This is an optimized version for use in Fermat
 * tests.  The input value of n is ignored; it is returned with
 * "mlen" words valid.
 *
 * Returns 0 on success, -1 on out-of-memory.  "mod" must be odd
 * (required by Montgomery reduction) and "exp" must be non-zero
 * and normalized (the top word non-zero) — both are asserted.
 *
 * Strategy: peel off as many leading exponent bits "e" as fit so that
 * 2^e < mod (a cheap shift), then square-and-conditionally-double for
 * the remaining bits, working in Montgomery form so each squaring is
 * a single lbnMontSquare_64 call.
 */
int
lbnTwoExpMod_64(BNWORD64 *n, BNWORD64 const *exp, unsigned elen,
	BNWORD64 *mod, unsigned mlen)
{
	unsigned e;	/* Copy of high words of the exponent */
	unsigned bits;	/* Assorted counter of bits */
	BNWORD64 const *bitptr;	/* Read cursor into exp (ms word first) */
	BNWORD64 bitword, bitpos;	/* Current exp word and single-bit mask */
	BNWORD64 *a, *b, *a1;	/* Double-width work buffers; a1 = high half */
	BNWORD64 inv;	/* Montgomery inverse of mod's LSW (also reused as a temp) */
	int y;		/* Result of bnYield() */

	assert(mlen);

	/* Point at the most-significant word of the exponent */
	bitptr = BIGLITTLE(exp-elen, exp+elen-1);
	bitword = *bitptr;
	assert(bitword);

	/* Clear n for future use. */
	lbnZero_64(n, mlen);

	bits = lbnBits_64(exp, elen);

	/* First, a couple of trivial cases. */
	if (bits <= 1) {
		/*
		 * 2 ^ 0 == 1,  2 ^ 1 == 2
		 * NOTE(review): this relies on exp being normalized, so here
		 * bits==1, elen==1, exp==1, and 1<<elen == 2 == 2^exp.
		 * (bits==0 cannot reach this point: assert(bitword) above.)
		 */
		BIGLITTLE(n[-1],n[0]) = (BNWORD64)1<<elen;
		return 0;
	}

	/* Set bitpos to the most significant bit */
	bitpos = (BNWORD64)1 << ((bits-1) & (64-1));

	/* Now, count the bits in the modulus. */
	bits = lbnBits_64(mod, mlen);
	assert(bits > 1);	/* a 1-bit modulus is just stupid... */

	/*
	 * We start with 1<<e, where "e" is as many high bits of the
	 * exponent as we can manage without going over the modulus.
	 * This first loop finds "e".
	 */
	e = 1;
	while (elen) {
		/* Consume the first bit */
		bitpos >>= 1;
		if (!bitpos) {
			if (!--elen)
				break;
			bitword = BIGLITTLE(*++bitptr,*--bitptr);
			bitpos = (BNWORD64)1<<(64-1);
		}
		e = (e << 1) | ((bitpos & bitword) != 0);
		if (e >= bits) {	/* Overflow!  Back out. */
			e >>= 1;
			break;
		}
	}
	/*
	 * The bit in "bitpos" being examined by the bit buffer has NOT
	 * been consumed yet.  This may be past the end of the exponent,
	 * in which case elen == 1.
	 */

	/* Okay, now, set bit "e" in n.  n is already zero. */
	/* ("inv" is borrowed as a scratch word here, before its real use.) */
	inv = (BNWORD64)1 << (e & (64-1));
	e /= 64;
	BIGLITTLE(n[-e-1],n[e]) = inv;
	/*
	 * The effective length of n in words is now "e+1".
	 * This is used a little bit later.
	 */

	if (!elen)
		return 0;	/* That was easy! */

	/*
	 * We have now processed the first few bits.  The next step
	 * is to convert this to Montgomery form for further squaring.
	 */

	/* Allocate working storage: two product buffers */
	LBNALLOC(a, BNWORD64, 2*mlen);
	if (!a)
		return -1;
	LBNALLOC(b, BNWORD64, 2*mlen);
	if (!b) {
		LBNFREE(a, 2*mlen);
		return -1;
	}

	/* Convert n to Montgomery form */
	inv = BIGLITTLE(mod[-1],mod[0]);	/* LSW of modulus */
	assert(inv & 1);	/* Modulus must be odd */
	inv = lbnMontInv1_64(inv);
	/* Move n (length e+1, remember?) up "mlen" words into b */
	/* Note that we lie about a1 for a bit - it's pointing to b */
	a1 = BIGLITTLE(b-mlen,b+mlen);
	lbnCopy_64(a1, n, e+1);
	lbnZero_64(b, mlen);
	/* Do the division - dump the quotient into the high-order words */
	(void)lbnDiv_64(a1, b, mlen+e+1, mod, mlen);
	/*
	 * Now do the first squaring and modular reduction to put
	 * the number up in a1 where it belongs.
	 */
	lbnMontSquare_64(a, b, mod, mlen, inv);
	/* Fix up a1 to point to where it should go. */
	a1 = BIGLITTLE(a-mlen,a+mlen);

	/*
	 * Okay, now, a1 holds the number being accumulated, and
	 * b is a scratch register.  Start working:
	 */
	for (;;) {
		/*
		 * Is the bit set?  If so, double a1 as well.
		 * A modular doubling like this is very cheap.
		 */
		if (bitpos & bitword) {
			/*
			 * Double the number.  If there was a carry out OR
			 * the result is greater than the modulus, subract
			 * the modulus.
			 */
			if (lbnDouble_64(a1, mlen) ||
			    lbnCmp_64(a1, mod, mlen) > 0)
				(void)lbnSubN_64(a1, mod, mlen);
		}

		/* Advance to the next exponent bit */
		bitpos >>= 1;
		if (!bitpos) {
			if (!--elen)
				break;	/* Done! */
			bitword = BIGLITTLE(*++bitptr,*--bitptr);
			bitpos = (BNWORD64)1<<(64-1);
		}

		/*
		 * The elen/bitword/bitpos bit buffer is known to be
		 * non-empty, i.e. there is at least one more unconsumed bit.
		 * Thus, it's safe to square the number.
		 */
		lbnMontSquare_64(b, a1, mod, mlen, inv);
		/* Rename result (in b) back to a (a1, really). */
		a1 = b; b = a; a = a1;
		a1 = BIGLITTLE(a-mlen,a+mlen);
#if BNYIELD
		if (bnYield && (y = bnYield()) < 0)
			goto yield;	/* Interrupted by yield callback */
#endif
	}

	/* DONE!  Just a little bit of cleanup... */

	/*
	 * Convert result out of Montgomery form... this is
	 * just a Montgomery reduction.
	 */
	lbnCopy_64(a, a1, mlen);
	lbnZero_64(a1, mlen);
	lbnMontReduce_64(a, mod, mlen, inv);
	lbnCopy_64(n, a1, mlen);

	/* Clean up - free intermediate storage */
	y = 0;
#if BNYIELD
yield:
#endif
	LBNFREE(b, 2*mlen);
	LBNFREE(a, 2*mlen);

	return y;	/* Success */
}
3271
3272
3273 /*
3274 * Returns a substring of the big-endian array of bytes representation
3275 * of the bignum array based on two parameters, the least significant
3276 * byte number (0 to start with the least significant byte) and the
3277 * length. I.e. the number returned is a representation of
3278 * (bn / 2^(8*lsbyte)) % 2 ^ (8*buflen).
3279 *
3280 * It is an error if the bignum is not at least buflen + lsbyte bytes
3281 * long.
3282 *
3283 * This code assumes that the compiler has the minimal intelligence
 * needed to optimize divides and modulo operations on an unsigned data
3285 * type with a power of two.
3286 */
3287 void
3288 lbnExtractBigBytes_64(BNWORD64 const *n, unsigned char *buf,
3289 unsigned lsbyte, unsigned buflen)
3290 {
3291 BNWORD64 t = 0; /* Needed to shut up uninitialized var warnings */
3292 unsigned shift;
3293
3294 lsbyte += buflen;
3295
3296 shift = (8 * lsbyte) % 64;
3297 lsbyte /= (64/8); /* Convert to word offset */
3298 BIGLITTLE(n -= lsbyte, n += lsbyte);
3299
3300 if (shift)
3301 t = BIGLITTLE(n[-1],n[0]);
3302
3303 while (buflen--) {
3304 if (!shift) {
3305 t = BIGLITTLE(*n++,*--n);
3306 shift = 64;
3307 }
3308 shift -= 8;
3309 *buf++ = (unsigned char)(t>>shift);
3310 }
3311 }
3312
3313 /*
3314 * Merge a big-endian array of bytes into a bignum array.
3315 * The array had better be big enough. This is
3316 * equivalent to extracting the entire bignum into a
3317 * large byte array, copying the input buffer into the
3318 * middle of it, and converting back to a bignum.
3319 *
3320 * The buf is "len" bytes long, and its *last* byte is at
3321 * position "lsbyte" from the end of the bignum.
3322 *
3323 * Note that this is a pain to get right. Fortunately, it's hardly
3324 * critical for efficiency.
3325 */
3326 void
3327 lbnInsertBigBytes_64(BNWORD64 *n, unsigned char const *buf,
3328 unsigned lsbyte, unsigned buflen)
3329 {
3330 BNWORD64 t = 0; /* Shut up uninitialized varibale warnings */
3331
3332 lsbyte += buflen;
3333
3334 BIGLITTLE(n -= lsbyte/(64/8), n += lsbyte/(64/8));
3335
3336 /* Load up leading odd bytes */
3337 if (lsbyte % (64/8)) {
3338 t = BIGLITTLE(*--n,*n++);
3339 t >>= (lsbyte * 8) % 64;
3340 }
3341
3342 /* The main loop - merge into t, storing at each word boundary. */
3343 while (buflen--) {
3344 t = (t << 8) | *buf++;
3345 if ((--lsbyte % (64/8)) == 0)
3346 BIGLITTLE(*n++,*--n) = t;
3347 }
3348
3349 /* Merge odd bytes in t into last word */
3350 lsbyte = (lsbyte * 8) % 64;
3351 if (lsbyte) {
3352 t <<= lsbyte;
3353 t |= (((BNWORD64)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
3354 BIGLITTLE(n[0],n[-1]) = t;
3355 }
3356
3357 return;
3358 }
3359
3360 /*
3361 * Returns a substring of the little-endian array of bytes representation
3362 * of the bignum array based on two parameters, the least significant
3363 * byte number (0 to start with the least significant byte) and the
3364 * length. I.e. the number returned is a representation of
3365 * (bn / 2^(8*lsbyte)) % 2 ^ (8*buflen).
3366 *
3367 * It is an error if the bignum is not at least buflen + lsbyte bytes
3368 * long.
3369 *
3370 * This code assumes that the compiler has the minimal intelligence
 * needed to optimize divides and modulo operations on an unsigned data
3372 * type with a power of two.
3373 */
3374 void
3375 lbnExtractLittleBytes_64(BNWORD64 const *n, unsigned char *buf,
3376 unsigned lsbyte, unsigned buflen)
3377 {
3378 BNWORD64 t = 0; /* Needed to shut up uninitialized var warnings */
3379
3380 BIGLITTLE(n -= lsbyte/(64/8), n += lsbyte/(64/8));
3381
3382 if (lsbyte % (64/8)) {
3383 t = BIGLITTLE(*--n,*n++);
3384 t >>= (lsbyte % (64/8)) * 8 ;
3385 }
3386
3387 while (buflen--) {
3388 if ((lsbyte++ % (64/8)) == 0)
3389 t = BIGLITTLE(*--n,*n++);
3390 *buf++ = (unsigned char)t;
3391 t >>= 8;
3392 }
3393 }
3394
3395 /*
3396 * Merge a little-endian array of bytes into a bignum array.
3397 * The array had better be big enough. This is
3398 * equivalent to extracting the entire bignum into a
3399 * large byte array, copying the input buffer into the
3400 * middle of it, and converting back to a bignum.
3401 *
3402 * The buf is "len" bytes long, and its first byte is at
3403 * position "lsbyte" from the end of the bignum.
3404 *
3405 * Note that this is a pain to get right. Fortunately, it's hardly
3406 * critical for efficiency.
3407 */
3408 void
3409 lbnInsertLittleBytes_64(BNWORD64 *n, unsigned char const *buf,
3410 unsigned lsbyte, unsigned buflen)
3411 {
3412 BNWORD64 t = 0; /* Shut up uninitialized varibale warnings */
3413
3414 /* Move to most-significant end */
3415 lsbyte += buflen;
3416 buf += buflen;
3417
3418 BIGLITTLE(n -= lsbyte/(64/8), n += lsbyte/(64/8));
3419
3420 /* Load up leading odd bytes */
3421 if (lsbyte % (64/8)) {
3422 t = BIGLITTLE(*--n,*n++);
3423 t >>= (lsbyte * 8) % 64;
3424 }
3425
3426 /* The main loop - merge into t, storing at each word boundary. */
3427 while (buflen--) {
3428 t = (t << 8) | *--buf;
3429 if ((--lsbyte % (64/8)) == 0)
3430 BIGLITTLE(*n++,*--n) = t;
3431 }
3432
3433 /* Merge odd bytes in t into last word */
3434 lsbyte = (lsbyte * 8) % 64;
3435 if (lsbyte) {
3436 t <<= lsbyte;
3437 t |= (((BNWORD64)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
3438 BIGLITTLE(n[0],n[-1]) = t;
3439 }
3440
3441 return;
3442 }
3443
3444 #ifdef DEADCODE /* This was a precursor to the more flexible lbnExtractBytes */
3445 /*
3446 * Convert a big-endian array of bytes to a bignum.
3447 * Returns the number of words in the bignum.
3448 * Note the expression "64/8" for the number of bytes per word.
3449 * This is so the word-size adjustment will work.
3450 */
3451 unsigned
3452 lbnFromBytes_64(BNWORD64 *a, unsigned char const *b, unsigned blen)
3453 {
3454 BNWORD64 t;
3455 unsigned alen = (blen + (64/8-1))/(64/8);
3456 BIGLITTLE(a -= alen, a += alen);
3457
3458 while (blen) {
3459 t = 0;
3460 do {
3461 t = t << 8 | *b++;
3462 } while (--blen & (64/8-1));
3463 BIGLITTLE(*a++,*--a) = t;
3464 }
3465 return alen;
3466 }
3467 #endif
3468
/*
 * Computes the GCD of a and b.  Modifies both arguments; when it returns,
 * one of them is the GCD and the other is trash.  The return value
 * indicates which: 0 for a, and 1 for b.  The length of the result is
 * returned in rlen.  Both inputs must have one extra word of precision.
 * alen must be >= blen.
 *
 * Returns a negative value if interrupted by the bnYield callback.
 *
 * TODO: use the binary algorithm (Knuth section 4.5.2, algorithm B).
 * This is based on taking out common powers of 2, then repeatedly:
 * gcd(2*u,v) = gcd(u,2*v) = gcd(u,v) - isolated powers of 2 can be deleted.
 * gcd(u,v) = gcd(u-v,v) - the numbers can be easily reduced.
 * It gets less reduction per step, but the steps are much faster than
 * the division case.
 */
int
lbnGcd_64(BNWORD64 *a, unsigned alen, BNWORD64 *b, unsigned blen,
	unsigned *rlen)
{
#if BNYIELD
	int y;
#endif
	assert(alen >= blen);

	/* Classic Euclid: alternate a %= b and b %= a until one hits 0 */
	while (blen != 0) {
		/* a %= b (quotient dumped into a's spare high words) */
		(void)lbnDiv_64(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
		alen = lbnNorm_64(a, blen);
		if (alen == 0) {
			/* b divides a exactly, so b holds the GCD */
			*rlen = blen;
			return 1;
		}
		/* b %= a (quotient dumped into b's spare high words) */
		(void)lbnDiv_64(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
		blen = lbnNorm_64(b, alen);
#if BNYIELD
		if (bnYield && (y = bnYield()) < 0)
			return y;	/* Interrupted */
#endif
	}
	/* b reached 0, so the GCD wound up in a */
	*rlen = alen;
	return 0;
}
3509
3510 /*
3511 * Invert "a" modulo "mod" using the extended Euclidean algorithm.
3512 * Note that this only computes one of the cosequences, and uses the
3513 * theorem that the signs flip every step and the absolute value of
3514 * the cosequence values are always bounded by the modulus to avoid
3515 * having to work with negative numbers.
3516 * gcd(a,mod) had better equal 1. Returns 1 if the GCD is NOT 1.
3517 * a must be one word longer than "mod". It is overwritten with the
3518 * result.
3519 * TODO: Use Richard Schroeppel's *much* faster algorithm.
3520 */
3521 int
3522 lbnInv_64(BNWORD64 *a, unsigned alen, BNWORD64 const *mod, unsigned mlen)
3523 {
3524 BNWORD64 *b; /* Hold a copy of mod during GCD reduction */
3525 BNWORD64 *p; /* Temporary for products added to t0 and t1 */
3526 BNWORD64 *t0, *t1; /* Inverse accumulators */
3527 BNWORD64 cy;
3528 unsigned blen, t0len, t1len, plen;
3529 int y;
3530
3531 alen = lbnNorm_64(a, alen);
3532 if (!alen)
3533 return 1; /* No inverse */
3534
3535 mlen = lbnNorm_64(mod, mlen);
3536
3537 assert (alen <= mlen);
3538
3539 /* Inverse of 1 is 1 */
3540 if (alen == 1 && BIGLITTLE(a[-1],a[0]) == 1) {
3541 lbnZero_64(BIGLITTLE(a-alen,a+alen), mlen-alen);
3542 return 0;
3543 }
3544
3545 /* Allocate a pile of space */
3546 LBNALLOC(b, BNWORD64, mlen+1);
3547 if (b) {
3548 /*
3549 * Although products are guaranteed to always be less than the
3550 * modulus, it can involve multiplying two 3-word numbers to
3551 * get a 5-word result, requiring a 6th word to store a 0
3552 * temporarily. Thus, mlen + 1.
3553 */
3554 LBNALLOC(p, BNWORD64, mlen+1);
3555 if (p) {
3556 LBNALLOC(t0, BNWORD64, mlen);
3557 if (t0) {
3558 LBNALLOC(t1, BNWORD64, mlen);
3559 if (t1)
3560 goto allocated;
3561 LBNFREE(t0, mlen);
3562 }
3563 LBNFREE(p, mlen+1);
3564 }
3565 LBNFREE(b, mlen+1);
3566 }
3567 return -1;
3568
3569 allocated:
3570
3571 /* Set t0 to 1 */
3572 t0len = 1;
3573 BIGLITTLE(t0[-1],t0[0]) = 1;
3574
3575 /* b = mod */
3576 lbnCopy_64(b, mod, mlen);
3577 /* blen = mlen (implicitly) */
3578
3579 /* t1 = b / a; b = b % a */
3580 cy = lbnDiv_64(t1, b, mlen, a, alen);
3581 *(BIGLITTLE(t1-(mlen-alen)-1,t1+(mlen-alen))) = cy;
3582 t1len = lbnNorm_64(t1, mlen-alen+1);
3583 blen = lbnNorm_64(b, alen);
3584
3585 /* while (b > 1) */
3586 while (blen > 1 || BIGLITTLE(b[-1],b[0]) != (BNWORD64)1) {
3587 /* q = a / b; a = a % b; */
3588 if (alen < blen || (alen == blen && lbnCmp_64(a, a, alen) < 0))
3589 assert(0);
3590 cy = lbnDiv_64(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
3591 *(BIGLITTLE(a-alen-1,a+alen)) = cy;
3592 plen = lbnNorm_64(BIGLITTLE(a-blen,a+blen), alen-blen+1);
3593 assert(plen);
3594 alen = lbnNorm_64(a, blen);
3595 if (!alen)
3596 goto failure; /* GCD not 1 */
3597
3598 /* t0 += q * t1; */
3599 assert(plen+t1len <= mlen+1);
3600 lbnMul_64(p, BIGLITTLE(a-blen,a+blen), plen, t1, t1len);
3601 plen = lbnNorm_64(p, plen + t1len);
3602 assert(plen <= mlen);
3603 if (plen > t0len) {
3604 lbnZero_64(BIGLITTLE(t0-t0len,t0+t0len), plen-t0len);
3605 t0len = plen;
3606 }
3607 cy = lbnAddN_64(t0, p, plen);
3608 if (cy) {
3609 if (t0len > plen) {
3610 cy = lbnAdd1_64(BIGLITTLE(t0-plen,t0+plen),
3611 t0len-plen, cy);
3612 }
3613 if (cy) {
3614 BIGLITTLE(t0[-t0len-1],t0[t0len]) = cy;
3615 t0len++;
3616 }
3617 }
3618
3619 /* if (a <= 1) return a ? t0 : FAIL; */
3620 if (alen <= 1 && BIGLITTLE(a[-1],a[0]) == (BNWORD64)1) {
3621 if (alen == 0)
3622 goto failure; /* FAIL */
3623 assert(t0len <= mlen);
3624 lbnCopy_64(a, t0, t0len);
3625 lbnZero_64(BIGLITTLE(a-t0len, a+t0len), mlen-t0len);
3626 goto success;
3627 }
3628
3629 /* q = b / a; b = b % a; */
3630 if (blen < alen || (blen == alen && lbnCmp_64(b, a, alen) < 0))
3631 assert(0);
3632 cy = lbnDiv_64(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
3633 *(BIGLITTLE(b-blen-1,b+blen)) = cy;
3634 plen = lbnNorm_64(BIGLITTLE(b-alen,b+alen), blen-alen+1);
3635 assert(plen);
3636 blen = lbnNorm_64(b, alen);
3637 if (!blen)
3638 goto failure; /* GCD not 1 */
3639
3640 /* t1 += q * t0; */
3641 assert(plen+t0len <= mlen+1);
3642 lbnMul_64(p, BIGLITTLE(b-alen,b+alen), plen, t0, t0len);
3643 plen = lbnNorm_64(p, plen + t0len);
3644 assert(plen <= mlen);
3645 if (plen > t1len) {
3646 lbnZero_64(BIGLITTLE(t1-t1len,t1+t1len), plen-t1len);
3647 t1len = plen;
3648 }
3649 cy = lbnAddN_64(t1, p, plen);
3650 if (cy) {
3651 if (t1len > plen) {
3652 cy = lbnAdd1_64(BIGLITTLE(t1-plen,t0+plen),
3653 t1len-plen, cy);
3654 }
3655 if (cy) {
3656 BIGLITTLE(t1[-t1len-1],t1[t1len]) = cy;
3657 t1len++;
3658 }
3659 }
3660 #if BNYIELD
3661 if (bnYield && (y = bnYield() < 0))
3662 goto yield;
3663 #endif
3664 }
3665
3666 if (!blen)
3667 goto failure; /* gcd(a, mod) != 1 -- FAIL */
3668
3669 /* return mod-t1 */
3670 lbnCopy_64(a, mod, mlen);
3671 assert(t1len <= mlen);
3672 cy = lbnSubN_64(a, t1, t1len);
3673 if (cy) {
3674 assert(mlen > t1len);
3675 cy = lbnSub1_64(BIGLITTLE(a-t1len, a+t1len), mlen-t1len, cy);
3676 assert(!cy);
3677 }
3678
3679 success:
3680 LBNFREE(t1, mlen);
3681 LBNFREE(t0, mlen);
3682 LBNFREE(p, mlen+1);
3683 LBNFREE(b, mlen+1);
3684
3685 return 0;
3686
3687 failure: /* GCD is not 1 - no inverse exists! */
3688 y = 1;
3689 #if BNYIELD
3690 yield:
3691 #endif
3692 LBNFREE(t1, mlen);
3693 LBNFREE(t0, mlen);
3694 LBNFREE(p, mlen+1);
3695 LBNFREE(b, mlen+1);
3696
3697 return y;
3698 }
3699
/*
 * Precompute powers of "a" mod "mod".  Compute them every "bits"
 * for "n" steps.  This is sufficient to compute powers of g with
 * exponents up to n*bits bits long, i.e. less than 2^(n*bits).
 *
 * This assumes that the caller has already initialized "array" to point
 * to "n" buffers of size "mlen".
 *
 * The table entries are stored in Montgomery form (array[i] holds
 * g^(2^(i*bits)) * R mod mod).  Returns 0 on success, -1 on
 * out-of-memory.  "mod" must be odd and normalized - both asserted.
 */
int
lbnBasePrecompBegin_64(BNWORD64 **array, unsigned n, unsigned bits,
	BNWORD64 const *g, unsigned glen, BNWORD64 *mod, unsigned mlen)
{
	BNWORD64 *a, *b;	/* Temporary double-width accumulators */
	BNWORD64 *a1;		/* Pointer to high half of a*/
	BNWORD64 inv;		/* Montgomery inverse of LSW of mod */
	BNWORD64 *t;
	unsigned i;

	glen = lbnNorm_64(g, glen);
	assert(glen);

	assert (mlen == lbnNorm_64(mod, mlen));
	assert (glen <= mlen);

	/* Allocate two temporary buffers, and the array slots */
	LBNALLOC(a, BNWORD64, mlen*2);
	if (!a)
		return -1;
	LBNALLOC(b, BNWORD64, mlen*2);
	if (!b) {
		LBNFREE(a, 2*mlen);
		return -1;
	}

	/* Okay, all ready */

	/* Convert n to Montgomery form */
	inv = BIGLITTLE(mod[-1],mod[0]);	/* LSW of modulus */
	assert(inv & 1);	/* Modulus must be odd */
	inv = lbnMontInv1_64(inv);
	/* Move g up "mlen" words into a (clearing the low mlen words) */
	a1 = BIGLITTLE(a-mlen,a+mlen);
	lbnCopy_64(a1, g, glen);
	lbnZero_64(a, mlen);

	/* Do the division - dump the quotient into the high-order words */
	/* (The remainder left in the low half of a is g*R mod mod.) */
	(void)lbnDiv_64(a1, a, mlen+glen, mod, mlen);

	/* Copy the first value into the array */
	t = *array;
	lbnCopy_64(t, a, mlen);
	a1 = a;	/* This first value is *not* shifted up */

	/* Now compute the remaining n-1 array entries */
	assert(bits);
	assert(n);
	while (--n) {
		/* Square "bits" times to advance by a factor of 2^bits */
		i = bits;
		do {
			/* Square a1 into b1 */
			lbnMontSquare_64(b, a1, mod, mlen, inv);
			/* Swap buffers; the fresh result becomes the source */
			t = b; b = a; a = t;
			a1 = BIGLITTLE(a-mlen, a+mlen);
		} while (--i);
		t = *++array;
		lbnCopy_64(t, a1, mlen);
	}

	/* Hooray, we're done. */
	LBNFREE(b, 2*mlen);
	LBNFREE(a, 2*mlen);
	return 0;
}
3773
3774 /*
3775 * result = base^exp (mod mod). "array" is a an array of pointers
3776 * to procomputed powers of base, each 2^bits apart. (I.e. array[i]
3777 * is base^(2^(i*bits))).
3778 *
3779 * The algorithm consists of:
3780 * a = b = (powers of g to be raised to the power 2^bits-1)
3781 * a *= b *= (powers of g to be raised to the power 2^bits-2)
3782 * ...
3783 * a *= b *= (powers of g to be raised to the power 1)
3784 *
3785 * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
3786 */
3787 int
3788 lbnBasePrecompExp_64(BNWORD64 *result, BNWORD64 const * const *array,
3789 unsigned bits, BNWORD64 const *exp, unsigned elen,
3790 BNWORD64 const *mod, unsigned mlen)
3791 {
3792 BNWORD64 *a, *b, *c, *t;
3793 BNWORD64 *a1, *b1;
3794 int anull, bnull; /* Null flags: values are implicitly 1 */
3795 unsigned i, j; /* Loop counters */
3796 unsigned mask; /* Exponent bits to examime */
3797 BNWORD64 const *eptr; /* Pointer into exp */
3798 BNWORD64 buf, curbits, nextword; /* Bit-buffer varaibles */
3799 BNWORD64 inv; /* Inverse of LSW of modulus */
3800 unsigned ewords; /* Words of exponent left */
3801 int bufbits; /* Number of valid bits */
3802 int y = 0;
3803
3804 mlen = lbnNorm_64(mod, mlen);
3805 assert (mlen);
3806
3807 elen = lbnNorm_64(exp, elen);
3808 if (!elen) {
3809 lbnZero_64(result, mlen);
3810 BIGLITTLE(result[-1],result[0]) = 1;
3811 return 0;
3812 }
3813 /*
3814 * This could be precomputed, but it's so cheap, and it would require
3815 * making the precomputation structure word-size dependent.
3816 */
3817 inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
3818
3819 assert(elen);
3820
3821 /*
3822 * Allocate three temporary buffers. The current numbers generally
3823 * live in the upper halves of these buffers.
3824 */
3825 LBNALLOC(a, BNWORD64, mlen*2);
3826 if (a) {
3827 LBNALLOC(b, BNWORD64, mlen*2);
3828 if (b) {
3829 LBNALLOC(c, BNWORD64, mlen*2);
3830 if (c)
3831 goto allocated;
3832 LBNFREE(b, 2*mlen);
3833 }
3834 LBNFREE(a, 2*mlen);
3835 }
3836 return -1;
3837
3838 allocated:
3839
3840 anull = bnull = 1;
3841
3842 mask = (1u<<bits) - 1;
3843 for (i = mask; i; --i) {
3844 /* Set up bit buffer for walking the exponent */
3845 eptr = exp;
3846 buf = BIGLITTLE(*--eptr, *eptr++);
3847 ewords = elen-1;
3848 bufbits = 64;
3849 for (j = 0; ewords || buf; j++) {
3850 /* Shift down current buffer */
3851 curbits = buf;
3852 buf >>= bits;
3853 /* If necessary, add next word */
3854 bufbits -= bits;
3855 if (bufbits < 0 && ewords > 0) {
3856 nextword = BIGLITTLE(*--eptr, *eptr++);
3857 ewords--;
3858 curbits |= nextword << (bufbits+bits);
3859 buf = nextword >> -bufbits;
3860 bufbits += 64;
3861 }
3862 /* If appropriate, multiply b *= array[j] */
3863 if ((curbits & mask) == i) {
3864 BNWORD64 const *d = array[j];
3865
3866 b1 = BIGLITTLE(b-mlen-1,b+mlen);
3867 if (bnull) {
3868 lbnCopy_64(b1, d, mlen);
3869 bnull = 0;
3870 } else {
3871 lbnMontMul_64(c, b1, d, mod, mlen, inv);
3872 t = c; c = b; b = t;
3873 }
3874 #if BNYIELD
3875 if (bnYield && (y = bnYield() < 0))
3876 goto yield;
3877 #endif
3878 }
3879 }
3880
3881 /* Multiply a *= b */
3882 if (!bnull) {
3883 a1 = BIGLITTLE(a-mlen-1,a+mlen);
3884 b1 = BIGLITTLE(b-mlen-1,b+mlen);
3885 if (anull) {
3886 lbnCopy_64(a1, b1, mlen);
3887 anull = 0;
3888 } else {
3889 lbnMontMul_64(c, a1, b1, mod, mlen, inv);
3890 t = c; c = a; a = t;
3891 }
3892 }
3893 }
3894
3895 assert(!anull); /* If it were, elen would have been 0 */
3896
3897 /* Convert out of Montgomery form and return */
3898 a1 = BIGLITTLE(a-mlen-1,a+mlen);
3899 lbnCopy_64(a, a1, mlen);
3900 lbnZero_64(a1, mlen);
3901 lbnMontReduce_64(a, mod, mlen, inv);
3902 lbnCopy_64(result, a1, mlen);
3903
3904 #if BNYIELD
3905 yield:
3906 #endif
3907 LBNFREE(c, 2*mlen);
3908 LBNFREE(b, 2*mlen);
3909 LBNFREE(a, 2*mlen);
3910
3911 return y;
3912 }
3913
3914 /*
3915 * result = base1^exp1 *base2^exp2 (mod mod). "array1" and "array2" are
 * arrays of pointers to precomputed powers of the corresponding bases,
3917 * each 2^bits apart. (I.e. array1[i] is base1^(2^(i*bits))).
3918 *
3919 * Bits must be the same in both. (It could be made adjustable, but it's
3920 * a bit of a pain. Just make them both equal to the larger one.)
3921 *
3922 * The algorithm consists of:
3923 * a = b = (powers of base1 and base2 to be raised to the power 2^bits-1)
3924 * a *= b *= (powers of base1 and base2 to be raised to the power 2^bits-2)
3925 * ...
3926 * a *= b *= (powers of base1 and base2 to be raised to the power 1)
3927 *
3928 * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
3929 */
3930 int
3931 lbnDoubleBasePrecompExp_64(BNWORD64 *result, unsigned bits,
3932 BNWORD64 const * const *array1, BNWORD64 const *exp1, unsigned elen1,
3933 BNWORD64 const * const *array2, BNWORD64 const *exp2,
3934 unsigned elen2, BNWORD64 const *mod, unsigned mlen)
3935 {
3936 BNWORD64 *a, *b, *c, *t;
3937 BNWORD64 *a1, *b1;
3938 int anull, bnull; /* Null flags: values are implicitly 1 */
3939 unsigned i, j, k; /* Loop counters */
3940 unsigned mask; /* Exponent bits to examime */
3941 BNWORD64 const *eptr; /* Pointer into exp */
3942 BNWORD64 buf, curbits, nextword; /* Bit-buffer varaibles */
3943 BNWORD64 inv; /* Inverse of LSW of modulus */
3944 unsigned ewords; /* Words of exponent left */
3945 int bufbits; /* Number of valid bits */
3946 int y = 0;
3947 BNWORD64 const * const *array;
3948
3949 mlen = lbnNorm_64(mod, mlen);
3950 assert (mlen);
3951
3952 elen1 = lbnNorm_64(exp1, elen1);
3953 if (!elen1) {
3954 return lbnBasePrecompExp_64(result, array2, bits, exp2, elen2,
3955 mod, mlen);
3956 }
3957 elen2 = lbnNorm_64(exp2, elen2);
3958 if (!elen2) {
3959 return lbnBasePrecompExp_64(result, array1, bits, exp1, elen1,
3960 mod, mlen);
3961 }
3962 /*
3963 * This could be precomputed, but it's so cheap, and it would require
3964 * making the precomputation structure word-size dependent.
3965 */
3966 inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
3967
3968 assert(elen1);
3969 assert(elen2);
3970
3971 /*
3972 * Allocate three temporary buffers. The current numbers generally
3973 * live in the upper halves of these buffers.
3974 */
3975 LBNALLOC(a, BNWORD64, mlen*2);
3976 if (a) {
3977 LBNALLOC(b, BNWORD64, mlen*2);
3978 if (b) {
3979 LBNALLOC(c, BNWORD64, mlen*2);
3980 if (c)
3981 goto allocated;
3982 LBNFREE(b, 2*mlen);
3983 }
3984 LBNFREE(a, 2*mlen);
3985 }
3986 return -1;
3987
3988 allocated:
3989
3990 anull = bnull = 1;
3991
3992 mask = (1u<<bits) - 1;
3993 for (i = mask; i; --i) {
3994 /* Walk each exponent in turn */
3995 for (k = 0; k < 2; k++) {
3996 /* Set up the exponent for walking */
3997 array = k ? array2 : array1;
3998 eptr = k ? exp2 : exp1;
3999 ewords = (k ? elen2 : elen1) - 1;
4000 /* Set up bit buffer for walking the exponent */
4001 buf = BIGLITTLE(*--eptr, *eptr++);
4002 bufbits = 64;
4003 for (j = 0; ewords || buf; j++) {
4004 /* Shift down current buffer */
4005 curbits = buf;
4006 buf >>= bits;
4007 /* If necessary, add next word */
4008 bufbits -= bits;
4009 if (bufbits < 0 && ewords > 0) {
4010 nextword = BIGLITTLE(*--eptr, *eptr++);
4011 ewords--;
4012 curbits |= nextword << (bufbits+bits);
4013 buf = nextword >> -bufbits;
4014 bufbits += 64;
4015 }
4016 /* If appropriate, multiply b *= array[j] */
4017 if ((curbits & mask) == i) {
4018 BNWORD64 const *d = array[j];
4019
4020 b1 = BIGLITTLE(b-mlen-1,b+mlen);
4021 if (bnull) {
4022 lbnCopy_64(b1, d, mlen);
4023 bnull = 0;
4024 } else {
4025 lbnMontMul_64(c, b1, d, mod, mlen, inv);
4026 t = c; c = b; b = t;
4027 }
4028 #if BNYIELD
4029 if (bnYield && (y = bnYield() < 0))
4030 goto yield;
4031 #endif
4032 }
4033 }
4034 }
4035
4036 /* Multiply a *= b */
4037 if (!bnull) {
4038 a1 = BIGLITTLE(a-mlen-1,a+mlen);
4039 b1 = BIGLITTLE(b-mlen-1,b+mlen);
4040 if (anull) {
4041 lbnCopy_64(a1, b1, mlen);
4042 anull = 0;
4043 } else {
4044 lbnMontMul_64(c, a1, b1, mod, mlen, inv);
4045 t = c; c = a; a = t;
4046 }
4047 }
4048 }
4049
4050 assert(!anull); /* If it were, elen would have been 0 */
4051
4052 /* Convert out of Montgomery form and return */
4053 a1 = BIGLITTLE(a-mlen-1,a+mlen);
4054 lbnCopy_64(a, a1, mlen);
4055 lbnZero_64(a1, mlen);
4056 lbnMontReduce_64(a, mod, mlen, inv);
4057 lbnCopy_64(result, a1, mlen);
4058
4059 #if BNYIELD
4060 yield:
4061 #endif
4062 LBNFREE(c, 2*mlen);
4063 LBNFREE(b, 2*mlen);
4064 LBNFREE(a, 2*mlen);
4065
4066 return y;
4067 }