crypto/bn/bn_asm.c

   1 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
   2  * All rights reserved.
   3  *
   4  * This package is an SSL implementation written
   5  * by Eric Young (eay@cryptsoft.com).
   6  * The implementation was written so as to conform with Netscapes SSL.
   7  *
   8  * This library is free for commercial and non-commercial use as long as
   9  * the following conditions are aheared to.  The following conditions
  10  * apply to all code found in this distribution, be it the RC4, RSA,
  11  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
  12  * included with this distribution is covered by the same copyright terms
  13  * except that the holder is Tim Hudson (tjh@cryptsoft.com).
  14  *
  15  * Copyright remains Eric Young's, and as such any Copyright notices in
  16  * the code are not to be removed.
  17  * If this package is used in a product, Eric Young should be given attribution
  18  * as the author of the parts of the library used.
  19  * This can be in the form of a textual message at program startup or
  20  * in documentation (online or textual) provided with the package.
  21  *
  22  * Redistribution and use in source and binary forms, with or without
  23  * modification, are permitted provided that the following conditions
  24  * are met:
  25  * 1. Redistributions of source code must retain the copyright
  26  *    notice, this list of conditions and the following disclaimer.
  27  * 2. Redistributions in binary form must reproduce the above copyright
  28  *    notice, this list of conditions and the following disclaimer in the
  29  *    documentation and/or other materials provided with the distribution.
  30  * 3. All advertising materials mentioning features or use of this software
  31  *    must display the following acknowledgement:
  32  *    "This product includes cryptographic software written by
  33  *     Eric Young (eay@cryptsoft.com)"
  34  *    The word 'cryptographic' can be left out if the rouines from the library
  35  *    being used are not cryptographic related :-).
  36  * 4. If you include any Windows specific code (or a derivative thereof) from
  37  *    the apps directory (application code) you must include an acknowledgement:
  38  *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
  39  *
  40  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
  41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  50  * SUCH DAMAGE.
  51  *
  52  * The licence and distribution terms for any publically available version or
  53  * derivative of this code cannot be changed.  i.e. this code cannot simply be
  54  * copied and put under another distribution licence
  55  * [including the GNU Public Licence.]
  56  */
  57
  58 #ifndef BN_DEBUG
  59 # undef NDEBUG                  /* avoid conflicting definitions */
  60 # define NDEBUG
  61 #endif
  62
  63 #include <assert.h>
  64 #include <openssl/crypto.h>
  65 #include "internal/cryptlib.h"
  66 #include "bn_lcl.h"
  67
  68 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
  69
  70 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
  71                           BN_ULONG w)
  72 {
  73     BN_ULONG c1 = 0;
  74
  75     assert(num >= 0);
  76     if (num <= 0)
  77         return (c1);
  78
  79 # ifndef OPENSSL_SMALL_FOOTPRINT
  80     while (num & ~3) {
  81         mul_add(rp[0], ap[0], w, c1);
  82         mul_add(rp[1], ap[1], w, c1);
  83         mul_add(rp[2], ap[2], w, c1);
  84         mul_add(rp[3], ap[3], w, c1);
  85         ap += 4;
  86         rp += 4;
  87         num -= 4;
  88     }
  89 # endif
  90     while (num) {
  91         mul_add(rp[0], ap[0], w, c1);
  92         ap++;
  93         rp++;
  94         num--;
  95     }
  96
  97     return (c1);
  98 }
  99
 100 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 101 {
 102     BN_ULONG c1 = 0;
 103
 104     assert(num >= 0);
 105     if (num <= 0)
 106         return (c1);
 107
 108 # ifndef OPENSSL_SMALL_FOOTPRINT
 109     while (num & ~3) {
 110         mul(rp[0], ap[0], w, c1);
 111         mul(rp[1], ap[1], w, c1);
 112         mul(rp[2], ap[2], w, c1);
 113         mul(rp[3], ap[3], w, c1);
 114         ap += 4;
 115         rp += 4;
 116         num -= 4;
 117     }
 118 # endif
 119     while (num) {
 120         mul(rp[0], ap[0], w, c1);
 121         ap++;
 122         rp++;
 123         num--;
 124     }
 125     return (c1);
 126 }
 127
 128 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
 129 {
 130     assert(n >= 0);
 131     if (n <= 0)
 132         return;
 133
 134 # ifndef OPENSSL_SMALL_FOOTPRINT
 135     while (n & ~3) {
 136         sqr(r[0], r[1], a[0]);
 137         sqr(r[2], r[3], a[1]);
 138         sqr(r[4], r[5], a[2]);
 139         sqr(r[6], r[7], a[3]);
 140         a += 4;
 141         r += 8;
 142         n -= 4;
 143     }
 144 # endif
 145     while (n) {
 146         sqr(r[0], r[1], a[0]);
 147         a++;
 148         r += 2;
 149         n--;
 150     }
 151 }
 152
 153 #else                           /* !(defined(BN_LLONG) ||
 154                                  * defined(BN_UMULT_HIGH)) */
 155
 156 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
 157                           BN_ULONG w)
 158 {
 159     BN_ULONG c = 0;
 160     BN_ULONG bl, bh;
 161
 162     assert(num >= 0);
 163     if (num <= 0)
 164         return ((BN_ULONG)0);
 165
 166     bl = LBITS(w);
 167     bh = HBITS(w);
 168
 169 # ifndef OPENSSL_SMALL_FOOTPRINT
 170     while (num & ~3) {
 171         mul_add(rp[0], ap[0], bl, bh, c);
 172         mul_add(rp[1], ap[1], bl, bh, c);
 173         mul_add(rp[2], ap[2], bl, bh, c);
 174         mul_add(rp[3], ap[3], bl, bh, c);
 175         ap += 4;
 176         rp += 4;
 177         num -= 4;
 178     }
 179 # endif
 180     while (num) {
 181         mul_add(rp[0], ap[0], bl, bh, c);
 182         ap++;
 183         rp++;
 184         num--;
 185     }
 186     return (c);
 187 }
 188
 189 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 190 {
 191     BN_ULONG carry = 0;
 192     BN_ULONG bl, bh;
 193
 194     assert(num >= 0);
 195     if (num <= 0)
 196         return ((BN_ULONG)0);
 197
 198     bl = LBITS(w);
 199     bh = HBITS(w);
 200
 201 # ifndef OPENSSL_SMALL_FOOTPRINT
 202     while (num & ~3) {
 203         mul(rp[0], ap[0], bl, bh, carry);
 204         mul(rp[1], ap[1], bl, bh, carry);
 205         mul(rp[2], ap[2], bl, bh, carry);
 206         mul(rp[3], ap[3], bl, bh, carry);
 207         ap += 4;
 208         rp += 4;
 209         num -= 4;
 210     }
 211 # endif
 212     while (num) {
 213         mul(rp[0], ap[0], bl, bh, carry);
 214         ap++;
 215         rp++;
 216         num--;
 217     }
 218     return (carry);
 219 }
 220
 221 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
 222 {
 223     assert(n >= 0);
 224     if (n <= 0)
 225         return;
 226
 227 # ifndef OPENSSL_SMALL_FOOTPRINT
 228     while (n & ~3) {
 229         sqr64(r[0], r[1], a[0]);
 230         sqr64(r[2], r[3], a[1]);
 231         sqr64(r[4], r[5], a[2]);
 232         sqr64(r[6], r[7], a[3]);
 233         a += 4;
 234         r += 8;
 235         n -= 4;
 236     }
 237 # endif
 238     while (n) {
 239         sqr64(r[0], r[1], a[0]);
 240         a++;
 241         r += 2;
 242         n--;
 243     }
 244 }
 245
 246 #endif                          /* !(defined(BN_LLONG) ||
 247                                  * defined(BN_UMULT_HIGH)) */
 248
 249 #if defined(BN_LLONG) && defined(BN_DIV2W)
 250
 251 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 252 {
 253     return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d));
 254 }
 255
 256 #else
 257
/*
 * Divide the two-word value (h:l) by d and return the one-word quotient.
 *
 * Returns BN_MASK2 when d is zero.  Otherwise d, h and l are shifted left
 * by i = BN_BITS2 - BN_num_bits_word(d) bits (presumably this normalises d
 * so that its top bit is set -- confirm against BN_num_bits_word) and the
 * quotient is produced as two BN_BITS4-bit digits, one per pass of the
 * outer loop.
 *
 * NOTE(review): the original author marked this routine as needing more
 * testing.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG dh, dl, q, ret = 0, th, tl, t;
    int i, count = 2;

    if (d == 0)
        return (BN_MASK2);

    i = BN_num_bits_word(d);
    assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));

    /* From here on i is the left-shift applied to d, h and l. */
    i = BN_BITS2 - i;
    if (h >= d)
        h -= d;

    if (i) {
        d <<= i;
        /* Bits shifted out of l move into the bottom of h. */
        h = (h << i) | (l >> (BN_BITS2 - i));
        l <<= i;
    }
    /* Upper and lower halves of the shifted divisor. */
    dh = (d & BN_MASK2h) >> BN_BITS4;
    dl = (d & BN_MASK2l);
    for (;;) {
        /* Estimate the next quotient digit from the upper half of d. */
        if ((h >> BN_BITS4) == dh)
            q = BN_MASK2l;
        else
            q = h / dh;

        th = q * dh;
        tl = dl * q;
        /* Lower the estimate until q * d no longer exceeds the dividend. */
        for (;;) {
            t = h - th;
            if ((t & BN_MASK2h) ||
                ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))
                break;
            q--;
            th -= dh;
            tl -= dl;
        }
        /* Re-align q * dl across the (th, tl) word pair. */
        t = (tl >> BN_BITS4);
        tl = (tl << BN_BITS4) & BN_MASK2h;
        th += t;

        /* Subtract (th:tl) from (h:l), borrowing from h when l wraps. */
        if (l < tl)
            th++;
        l -= tl;
        if (h < th) {
            /* Estimate was still one too large: add d back, step q down. */
            h += d;
            q--;
        }
        h -= th;

        if (--count == 0)
            break;

        /* First digit becomes the upper half of the result. */
        ret = q << BN_BITS4;
        /* Shift the remainder up by half a word for the second digit. */
        h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
        l = (l & BN_MASK2l) << BN_BITS4;
    }
    /* Second digit becomes the lower half of the result. */
    ret |= q;
    return (ret);
}
#endif                          /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
 323
 324 #ifdef BN_LLONG
 325 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 326                       int n)
 327 {
 328     BN_ULLONG ll = 0;
 329
 330     assert(n >= 0);
 331     if (n <= 0)
 332         return ((BN_ULONG)0);
 333
 334 # ifndef OPENSSL_SMALL_FOOTPRINT
 335     while (n & ~3) {
 336         ll += (BN_ULLONG) a[0] + b[0];
 337         r[0] = (BN_ULONG)ll & BN_MASK2;
 338         ll >>= BN_BITS2;
 339         ll += (BN_ULLONG) a[1] + b[1];
 340         r[1] = (BN_ULONG)ll & BN_MASK2;
 341         ll >>= BN_BITS2;
 342         ll += (BN_ULLONG) a[2] + b[2];
 343         r[2] = (BN_ULONG)ll & BN_MASK2;
 344         ll >>= BN_BITS2;
 345         ll += (BN_ULLONG) a[3] + b[3];
 346         r[3] = (BN_ULONG)ll & BN_MASK2;
 347         ll >>= BN_BITS2;
 348         a += 4;
 349         b += 4;
 350         r += 4;
 351         n -= 4;
 352     }
 353 # endif
 354     while (n) {
 355         ll += (BN_ULLONG) a[0] + b[0];
 356         r[0] = (BN_ULONG)ll & BN_MASK2;
 357         ll >>= BN_BITS2;
 358         a++;
 359         b++;
 360         r++;
 361         n--;
 362     }
 363     return ((BN_ULONG)ll);
 364 }
 365 #else                           /* !BN_LLONG */
 366 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 367                       int n)
 368 {
 369     BN_ULONG c, l, t;
 370
 371     assert(n >= 0);
 372     if (n <= 0)
 373         return ((BN_ULONG)0);
 374
 375     c = 0;
 376 # ifndef OPENSSL_SMALL_FOOTPRINT
 377     while (n & ~3) {
 378         t = a[0];
 379         t = (t + c) & BN_MASK2;
 380         c = (t < c);
 381         l = (t + b[0]) & BN_MASK2;
 382         c += (l < t);
 383         r[0] = l;
 384         t = a[1];
 385         t = (t + c) & BN_MASK2;
 386         c = (t < c);
 387         l = (t + b[1]) & BN_MASK2;
 388         c += (l < t);
 389         r[1] = l;
 390         t = a[2];
 391         t = (t + c) & BN_MASK2;
 392         c = (t < c);
 393         l = (t + b[2]) & BN_MASK2;
 394         c += (l < t);
 395         r[2] = l;
 396         t = a[3];
 397         t = (t + c) & BN_MASK2;
 398         c = (t < c);
 399         l = (t + b[3]) & BN_MASK2;
 400         c += (l < t);
 401         r[3] = l;
 402         a += 4;
 403         b += 4;
 404         r += 4;
 405         n -= 4;
 406     }
 407 # endif
 408     while (n) {
 409         t = a[0];
 410         t = (t + c) & BN_MASK2;
 411         c = (t < c);
 412         l = (t + b[0]) & BN_MASK2;
 413         c += (l < t);
 414         r[0] = l;
 415         a++;
 416         b++;
 417         r++;
 418         n--;
 419     }
 420     return ((BN_ULONG)c);
 421 }
 422 #endif                          /* !BN_LLONG */
 423
 424 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 425                       int n)
 426 {
 427     BN_ULONG t1, t2;
 428     int c = 0;
 429
 430     assert(n >= 0);
 431     if (n <= 0)
 432         return ((BN_ULONG)0);
 433
 434 #ifndef OPENSSL_SMALL_FOOTPRINT
 435     while (n & ~3) {
 436         t1 = a[0];
 437         t2 = b[0];
 438         r[0] = (t1 - t2 - c) & BN_MASK2;
 439         if (t1 != t2)
 440             c = (t1 < t2);
 441         t1 = a[1];
 442         t2 = b[1];
 443         r[1] = (t1 - t2 - c) & BN_MASK2;
 444         if (t1 != t2)
 445             c = (t1 < t2);
 446         t1 = a[2];
 447         t2 = b[2];
 448         r[2] = (t1 - t2 - c) & BN_MASK2;
 449         if (t1 != t2)
 450             c = (t1 < t2);
 451         t1 = a[3];
 452         t2 = b[3];
 453         r[3] = (t1 - t2 - c) & BN_MASK2;
 454         if (t1 != t2)
 455             c = (t1 < t2);
 456         a += 4;
 457         b += 4;
 458         r += 4;
 459         n -= 4;
 460     }
 461 #endif
 462     while (n) {
 463         t1 = a[0];
 464         t2 = b[0];
 465         r[0] = (t1 - t2 - c) & BN_MASK2;
 466         if (t1 != t2)
 467             c = (t1 < t2);
 468         a++;
 469         b++;
 470         r++;
 471         n--;
 472     }
 473     return (c);
 474 }
 475
 476 #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
 477
 478 # undef bn_mul_comba8
 479 # undef bn_mul_comba4
 480 # undef bn_sqr_comba8
 481 # undef bn_sqr_comba4
 482
 483 /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
 484 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
 485 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */
 490
 491 # ifdef BN_LLONG
/*
 * BN_LLONG flavour: products are formed in the double-width type
 * BN_ULLONG and split with Lw()/Hw() (from bn_lcl.h; presumably the low
 * and high word of the value).
 *
 * Keep in mind that adding one word to a multiplication result cannot
 * overflow, because the high half of the product cannot be all-ones.
 */

/*
 * (c2,c1,c0) += a*b.  c0 is folded into the product, the low word of the
 * sum becomes the new c0, the high word is added to c1 and c2 is
 * incremented when that addition wraps.
 */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/*
 * (c2,c1,c0) += 2*a*b.  The product t is computed once and accumulated
 * twice: first through the copy tt, then through t itself.
 */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
        BN_ULLONG tt = t+c0;    /* no carry */  \
        c0 = (BN_ULONG)Lw(tt);                  \
        hi = (BN_ULONG)Hw(tt);                  \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* (c2,c1,c0) += a[i]*a[i]; same accumulation steps as mul_add_c(). */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)a[i]*a[i];     \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* (c2,c1,c0) += 2*a[i]*a[j]; forwards to mul_add_c2(). */
#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 529
 530 # elif defined(BN_UMULT_LOHI)
/*
 * BN_UMULT_LOHI flavour: BN_UMULT_LOHI(lo,hi,ta,tb) is expected to leave
 * the product of ta and tb in the word pair lo, hi (the macro itself is
 * defined outside this file).
 *
 * Keep in mind that additions to hi cannot overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */

/*
 * (c2,c1,c0) += a*b.  lo is added to c0 and a wrap is carried into hi;
 * hi is added to c1 and a wrap is carried into c2.  The arguments a and b
 * are copied into locals first, so each is evaluated once.
 */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

/*
 * (c2,c1,c0) += 2*a*b.  The product is computed once and accumulated
 * twice; tt carries the high word (plus carry) of the first pass so that
 * hi stays intact for the second.
 */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi, tt;                    \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; tt = hi+((c0<lo)?1:0);        \
        c1 += tt; c2 += (c1<tt)?1:0;            \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

/* (c2,c1,c0) += a[i]*a[i]; same accumulation steps as mul_add_c(). */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG ta = (a)[i];                   \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,ta);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

/* (c2,c1,c0) += 2*a[i]*a[j]; forwards to mul_add_c2(). */
#  define sqr_add_c2(a,i,j,c0,c1,c2)    \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 563
 564 # elif defined(BN_UMULT_HIGH)
/*
 * BN_UMULT_HIGH flavour: the low product word comes from an ordinary
 * single-word multiplication and the high word from BN_UMULT_HIGH()
 * (defined outside this file).
 *
 * Keep in mind that additions to hi cannot overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */

/*
 * (c2,c1,c0) += a*b.  lo is added to c0 and a wrap is carried into hi;
 * hi is added to c1 and a wrap is carried into c2.  The arguments a and b
 * are copied into locals first, so each is evaluated once.
 */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo = ta * tb;                  \
        BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

/*
 * (c2,c1,c0) += 2*a*b.  The product is computed once and accumulated
 * twice; tt carries the high word (plus carry) of the first pass so that
 * hi stays intact for the second.
 */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG ta = (a), tb = (b), tt;        \
        BN_ULONG lo = ta * tb;                  \
        BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
        c0 += lo; tt = hi + ((c0<lo)?1:0);      \
        c1 += tt; c2 += (c1<tt)?1:0;            \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

/* (c2,c1,c0) += a[i]*a[i]; same accumulation steps as mul_add_c(). */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG ta = (a)[i];                   \
        BN_ULONG lo = ta * ta;                  \
        BN_ULONG hi = BN_UMULT_HIGH(ta,ta);     \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

/* (c2,c1,c0) += 2*a[i]*a[j]; forwards to mul_add_c2(). */
#  define sqr_add_c2(a,i,j,c0,c1,c2)      \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 597
 598 # else                          /* !BN_LLONG */
/*
 * Portable flavour: operands are split into halves with LBITS()/HBITS()
 * and multiplied with mul64() / sqr64() from bn_lcl.h; the code below
 * relies on those macros leaving the product in the lo, hi pair.
 *
 * Keep in mind that additions to hi cannot overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */

/*
 * (c2,c1,c0) += a*b.  lo is added to c0 and a wrap is carried into hi;
 * hi is added to c1 and a wrap is carried into c2.  Note that a and b are
 * each expanded twice (once by LBITS, once by HBITS).
 */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
        BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
        mul64(lo,hi,bl,bh);                     \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/*
 * (c2,c1,c0) += 2*a*b.  The product is computed once and accumulated
 * twice; tt carries the high word (plus carry) of the first pass so that
 * hi stays intact for the second.
 */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG tt;                            \
        BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
        BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
        mul64(lo,hi,bl,bh);                     \
        tt = hi;                                \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \
        c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* (c2,c1,c0) += a[i]*a[i], with the square supplied by sqr64(). */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG lo, hi;                        \
        sqr64(lo,hi,(a)[i]);                    \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* (c2,c1,c0) += 2*a[i]*a[j]; forwards to mul_add_c2(). */
#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 632 # endif                         /* !BN_LLONG */
 633
/*
 * Multiply the 8-word numbers a and b into the 16-word result r.
 *
 * The product is built one result word ("column") at a time: for column k
 * every a[i]*b[j] with i + j == k is added through mul_add_c() into a
 * three-word accumulator.  c1, c2 and c3 take turns as the low, middle and
 * high accumulator word; once a column is complete its low word is stored
 * to r[k], cleared, and reused as the high word of the next column.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6 */
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    /* column 7 (the widest: eight products) */
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    /* column 8 */
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    /* column 9 */
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    /* column 10 */
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    /* column 11 */
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    /* column 12 */
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    /* column 13 */
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    /* column 14; the word carried out of it is the top result word */
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}
 736
/*
 * Multiply the 4-word numbers a and b into the 8-word result r.
 *
 * Same column-by-column scheme as bn_mul_comba8(): for column k every
 * a[i]*b[j] with i + j == k is added through mul_add_c() into a
 * three-word accumulator whose words rotate through c1, c2 and c3.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 (the widest: four products) */
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6; the word carried out of it is the top result word */
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}
 775
/*
 * Square the 8-word number a into the 16-word result r.
 *
 * Built column by column like bn_mul_comba8(), but each pair of distinct
 * indices is visited only once: column k gets 2*a[i]*a[j] for every
 * i > j with i + j == k through sqr_add_c2(), plus a[k/2]^2 through
 * sqr_add_c() when k is even.  c1, c2 and c3 rotate as the low, middle
 * and high accumulator word.
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    sqr_add_c2(a, 4, 0, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    sqr_add_c2(a, 5, 0, c3, c1, c2);
    sqr_add_c2(a, 4, 1, c3, c1, c2);
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6 */
    sqr_add_c(a, 3, c1, c2, c3);
    sqr_add_c2(a, 4, 2, c1, c2, c3);
    sqr_add_c2(a, 5, 1, c1, c2, c3);
    sqr_add_c2(a, 6, 0, c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    /* column 7 */
    sqr_add_c2(a, 7, 0, c2, c3, c1);
    sqr_add_c2(a, 6, 1, c2, c3, c1);
    sqr_add_c2(a, 5, 2, c2, c3, c1);
    sqr_add_c2(a, 4, 3, c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    /* column 8 */
    sqr_add_c(a, 4, c3, c1, c2);
    sqr_add_c2(a, 5, 3, c3, c1, c2);
    sqr_add_c2(a, 6, 2, c3, c1, c2);
    sqr_add_c2(a, 7, 1, c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    /* column 9 */
    sqr_add_c2(a, 7, 2, c1, c2, c3);
    sqr_add_c2(a, 6, 3, c1, c2, c3);
    sqr_add_c2(a, 5, 4, c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    /* column 10 */
    sqr_add_c(a, 5, c2, c3, c1);
    sqr_add_c2(a, 6, 4, c2, c3, c1);
    sqr_add_c2(a, 7, 3, c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    /* column 11 */
    sqr_add_c2(a, 7, 4, c3, c1, c2);
    sqr_add_c2(a, 6, 5, c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    /* column 12 */
    sqr_add_c(a, 6, c1, c2, c3);
    sqr_add_c2(a, 7, 5, c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    /* column 13 */
    sqr_add_c2(a, 7, 6, c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    /* column 14; the word carried out of it is the top result word */
    sqr_add_c(a, 7, c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}
 850
/*
 * Square the 4-word number a into the 8-word result r.
 *
 * Same scheme as bn_sqr_comba8(): column k gets 2*a[i]*a[j] for every
 * i > j with i + j == k through sqr_add_c2(), plus a[k/2]^2 through
 * sqr_add_c() when k is even.
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6; the word carried out of it is the top result word */
    sqr_add_c(a, 3, c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}
 883
 884 # ifdef OPENSSL_NO_ASM
 885 #  ifdef OPENSSL_BN_ASM_MONT
 886 #   include <alloca.h>
/*
 * This is essentially a reference implementation, which may or may not
 * result in performance improvement. E.g. on IA-32 this routine was
 * observed to give 40% faster rsa1024 private key operations and 10%
 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
 * reference implementation, one to be used as starting point for
 * platform-specific assembler. Mentioned numbers apply to compiler
 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
 * can vary not only from platform to platform, but even for compiler
 * versions. Assembler vs. assembler improvement coefficients can
 * [and are known to] differ and are to be documented elsewhere.
 *
 * Parameters, as used by the code below:
 *   rp   receives num result words
 *   ap   num-word multiplicand
 *   bp   num-word multiplier, consumed one word per outer pass
 *   np   num-word modulus
 *   n0p  points to the single word n0 that is multiplied with tp[0] to
 *        choose the multiple of np added in each pass (presumably the
 *        Montgomery constant -n^-1 mod 2^BN_BITS2 -- confirm with caller)
 *   num  operand length in words; the code indexes tp[num - 1], so it
 *        must be at least 1
 *
 * Working storage is num + 2 words obtained with alloca() and cleared
 * through a volatile pointer before returning.  This implementation
 * always returns 1.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num)
{
    BN_ULONG c0, c1, ml, *tp, n0;
#   ifdef mul64
    BN_ULONG mh;
#   endif
    volatile BN_ULONG *vp;
    int i = 0, j;

#   if 0                        /* template for platform-specific
                                 * implementation */
    if (ap == bp)
        return bn_sqr_mont(rp, ap, np, n0p, num);
#   endif
    /* vp aliases tp; it is only used for the final clearing stores. */
    vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

    n0 = *n0p;

    /*
     * First pass (i == 0): tp is still uninitialised, so it is filled with
     * mul() rather than accumulated into with mul_add().
     */
    c0 = 0;
    ml = bp[0];
#   ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    for (j = 0; j < num; ++j)
        mul(tp[j], ap[j], ml, mh, c0);
#   else
    for (j = 0; j < num; ++j)
        mul(tp[j], ap[j], ml, c0);
#   endif

    tp[num] = c0;
    tp[num + 1] = 0;
    /* Skip the accumulate half of the loop body for the first pass. */
    goto enter;

    for (i = 0; i < num; i++) {
        /* Accumulate ap * bp[i] into tp. */
        c0 = 0;
        ml = bp[i];
#   ifdef mul64
        mh = HBITS(ml);
        ml = LBITS(ml);
        for (j = 0; j < num; ++j)
            mul_add(tp[j], ap[j], ml, mh, c0);
#   else
        for (j = 0; j < num; ++j)
            mul_add(tp[j], ap[j], ml, c0);
#   endif
        /* Fold the carry into the two extra top words. */
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] = (c1 < c0 ? 1 : 0);
 enter:
        /*
         * Add ml * np to tp, where ml = tp[0] * n0, storing each word one
         * position lower (tp[j - 1]); the lowest result word is dropped.
         */
        c1 = tp[0];
        ml = (c1 * n0) & BN_MASK2;
        c0 = 0;
#   ifdef mul64
        mh = HBITS(ml);
        ml = LBITS(ml);
        mul_add(c1, np[0], ml, mh, c0);
#   else
        mul_add(c1, ml, np[0], c0);
#   endif
        for (j = 1; j < num; j++) {
            c1 = tp[j];
#   ifdef mul64
            mul_add(c1, np[j], ml, mh, c0);
#   else
            mul_add(c1, ml, np[j], c0);
#   endif
            tp[j - 1] = c1 & BN_MASK2;
        }
        /* The two extra top words move down by one position as well. */
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num - 1] = c1;
        tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
    }

    /*
     * Final correction: if tp may be >= np, try rp = tp - np and keep it
     * when tp had an extra top word or the subtraction did not borrow.
     */
    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
        c0 = bn_sub_words(rp, tp, np, num);
        if (tp[num] != 0 || c0 == 0) {
            for (i = 0; i < num + 2; i++)
                vp[i] = 0;
            return 1;
        }
    }
    /* Otherwise the result is tp itself; copy it out and clear tp. */
    for (i = 0; i < num; i++)
        rp[i] = tp[i], vp[i] = 0;
    vp[num] = 0;
    vp[num + 1] = 0;
    return 1;
}
 989 #  else
 990 /*
 991  * Return value of 0 indicates that multiplication/convolution was not
 992  * performed to signal the caller to fall down to alternative/original
 993  * code-path.
 994  */
 995 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 996                 const BN_ULONG *np, const BN_ULONG *n0, int num)
 997 {
 998     return 0;
 999 }
1000 #  endif                        /* OPENSSL_BN_ASM_MONT */
1001 # endif
1002
1003 #else                           /* !BN_MUL_COMBA */
1004
1005 /* hmm... is it faster just to do a multiply? */
1006 # undef bn_sqr_comba4
1007 # undef bn_sqr_comba8
1008 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
1009 {
1010     BN_ULONG t[8];
1011     bn_sqr_normal(r, a, 4, t);
1012 }
1013
1014 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
1015 {
1016     BN_ULONG t[16];
1017     bn_sqr_normal(r, a, 8, t);
1018 }
1019
1020 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1021 {
1022     r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
1023     r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
1024     r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
1025     r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
1026 }
1027
1028 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1029 {
1030     r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
1031     r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
1032     r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
1033     r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
1034     r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
1035     r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
1036     r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
1037     r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
1038 }
1039
1040 # ifdef OPENSSL_NO_ASM
1041 #  ifdef OPENSSL_BN_ASM_MONT
1042 #   include <alloca.h>
1043 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1044                 const BN_ULONG *np, const BN_ULONG *n0p, int num)
1045 {
1046     BN_ULONG c0, c1, *tp, n0 = *n0p;
1047     volatile BN_ULONG *vp;
1048     int i = 0, j;
1049
1050     vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
1051
1052     for (i = 0; i <= num; i++)
1053         tp[i] = 0;
1054
1055     for (i = 0; i < num; i++) {
1056         c0 = bn_mul_add_words(tp, ap, num, bp[i]);
1057         c1 = (tp[num] + c0) & BN_MASK2;
1058         tp[num] = c1;
1059         tp[num + 1] = (c1 < c0 ? 1 : 0);
1060
1061         c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
1062         c1 = (tp[num] + c0) & BN_MASK2;
1063         tp[num] = c1;
1064         tp[num + 1] += (c1 < c0 ? 1 : 0);
1065         for (j = 0; j <= num; j++)
1066             tp[j] = tp[j + 1];
1067     }
1068
1069     if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
1070         c0 = bn_sub_words(rp, tp, np, num);
1071         if (tp[num] != 0 || c0 == 0) {
1072             for (i = 0; i < num + 2; i++)
1073                 vp[i] = 0;
1074             return 1;
1075         }
1076     }
1077     for (i = 0; i < num; i++)
1078         rp[i] = tp[i], vp[i] = 0;
1079     vp[num] = 0;
1080     vp[num + 1] = 0;
1081     return 1;
1082 }
1083 #  else
1084 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1085                 const BN_ULONG *np, const BN_ULONG *n0, int num)
1086 {
1087     return 0;
1088 }
1089 #  endif                        /* OPENSSL_BN_ASM_MONT */
1090 # endif
1091
1092 #endif                          /* !BN_MUL_COMBA */