1 /*
2 * Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
3 *
4 * Licensed under the Apache License 2.0 (the "License"). You may not use
5 * this file except in compliance with the License. You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
8 */
9
10 /* Copyright 2011 Google Inc.
11 *
12 * Licensed under the Apache License, Version 2.0 (the "License");
13 *
14 * you may not use this file except in compliance with the License.
15 * You may obtain a copy of the License at
16 *
17 * http://www.apache.org/licenses/LICENSE-2.0
18 *
19 * Unless required by applicable law or agreed to in writing, software
20 * distributed under the License is distributed on an "AS IS" BASIS,
21 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22 * See the License for the specific language governing permissions and
23 * limitations under the License.
24 */
25
26 /*
27 * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
28 *
29 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
30 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
31 * work which got its smarts from Daniel J. Bernstein's work on the same.
32 */
33
34 #include <openssl/opensslconf.h>
35 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
36 NON_EMPTY_TRANSLATION_UNIT
37 #else
38
39 # include <stdint.h>
40 # include <string.h>
41 # include <openssl/err.h>
42 # include "ec_lcl.h"
43
44 # if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
45 /* even with gcc, the typedef won't work for 32-bit platforms */
46 typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit
47 * platforms */
48 typedef __int128_t int128_t;
49 # else
50 # error "Your compiler doesn't appear to support 128-bit integer types"
51 # endif
52
53 typedef uint8_t u8;
54 typedef uint32_t u32;
55 typedef uint64_t u64;
56
57 /*
58 * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
59 * can serialise an element of this field into 32 bytes. We call this an
60 * felem_bytearray.
61 */
62
63 typedef u8 felem_bytearray[32];
64
65 /*
66 * These are the parameters of P256, taken from FIPS 186-3, page 86. These
67 * values are big-endian.
68 */
69 static const felem_bytearray nistp256_curve_params[5] = {
70 {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
71 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
72 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
73 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
74 {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
75 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
76 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
77 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc}, /* b */
78 {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7,
79 0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
80 0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
81 0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
82 {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
83 0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
84 0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
85 0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
86 {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
87 0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
88 0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
89 0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
90 };
91
92 /*-
93 * The representation of field elements.
94 * ------------------------------------
95 *
96 * We represent field elements with either four 128-bit values, eight 128-bit
97 * values, or four 64-bit values. The field element represented is:
98 * v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192 (mod p)
99 * or:
100  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[7]*2^448  (mod p)
101 *
102 * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
103 * apart, but are 128-bits wide, the most significant bits of each limb overlap
104 * with the least significant bits of the next.
105 *
106 * A field element with four limbs is an 'felem'. One with eight limbs is a
107 * 'longfelem'
108 *
109  * A field element with four 64-bit values is called a 'smallfelem'. Small
110 * values are used as intermediate values before multiplication.
111 */
112
113 # define NLIMBS 4
114
115 typedef uint128_t limb;
116 typedef limb felem[NLIMBS];
117 typedef limb longfelem[NLIMBS * 2];
118 typedef u64 smallfelem[NLIMBS];
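/*
 * Illustrative example (editorial note, not in the upstream source): the
 * field element 2^64 + 5 would be held as the smallfelem {5, 1, 0, 0}
 * (little-endian 64-bit words) and as the felem {5, 1, 0, 0} with each
 * value widened to a 128-bit limb. After a few additions the felem limbs
 * may grow past 64 bits and overlap, which the reduction routines below
 * are designed to tolerate.
 */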
119
120 /* This is the value of the prime as four 64-bit words, little-endian. */
121 static const u64 kPrime[4] =
122 { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
123 static const u64 bottom63bits = 0x7ffffffffffffffful;
124
125 /*
126 * bin32_to_felem takes a little-endian byte array and converts it into felem
127 * form. This assumes that the CPU is little-endian.
128 */
129 static void bin32_to_felem(felem out, const u8 in[32])
130 {
131 out[0] = *((u64 *)&in[0]);
132 out[1] = *((u64 *)&in[8]);
133 out[2] = *((u64 *)&in[16]);
134 out[3] = *((u64 *)&in[24]);
135 }
136
137 /*
138  * smallfelem_to_bin32 takes a smallfelem and serialises it into a
139  * little-endian, 32 byte array. This assumes that the CPU is little-endian.
140 */
141 static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
142 {
143 *((u64 *)&out[0]) = in[0];
144 *((u64 *)&out[8]) = in[1];
145 *((u64 *)&out[16]) = in[2];
146 *((u64 *)&out[24]) = in[3];
147 }
148
149 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
150 static void flip_endian(u8 *out, const u8 *in, unsigned len)
151 {
152 unsigned i;
153 for (i = 0; i < len; ++i)
154 out[i] = in[len - 1 - i];
155 }
156
157 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
158 static int BN_to_felem(felem out, const BIGNUM *bn)
159 {
160 felem_bytearray b_in;
161 felem_bytearray b_out;
162 unsigned num_bytes;
163
164 /* BN_bn2bin eats leading zeroes */
165 memset(b_out, 0, sizeof(b_out));
166 num_bytes = BN_num_bytes(bn);
167 if (num_bytes > sizeof(b_out)) {
168 ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
169 return 0;
170 }
171 if (BN_is_negative(bn)) {
172 ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
173 return 0;
174 }
175 num_bytes = BN_bn2bin(bn, b_in);
176 flip_endian(b_out, b_in, num_bytes);
177 bin32_to_felem(out, b_out);
178 return 1;
179 }
180
181 /* smallfelem_to_BN converts a smallfelem into an OpenSSL BIGNUM */
182 static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
183 {
184 felem_bytearray b_in, b_out;
185 smallfelem_to_bin32(b_in, in);
186 flip_endian(b_out, b_in, sizeof(b_out));
187 return BN_bin2bn(b_out, sizeof(b_out), out);
188 }
189
190 /*-
191 * Field operations
192 * ----------------
193 */
194
195 static void smallfelem_one(smallfelem out)
196 {
197 out[0] = 1;
198 out[1] = 0;
199 out[2] = 0;
200 out[3] = 0;
201 }
202
203 static void smallfelem_assign(smallfelem out, const smallfelem in)
204 {
205 out[0] = in[0];
206 out[1] = in[1];
207 out[2] = in[2];
208 out[3] = in[3];
209 }
210
211 static void felem_assign(felem out, const felem in)
212 {
213 out[0] = in[0];
214 out[1] = in[1];
215 out[2] = in[2];
216 out[3] = in[3];
217 }
218
219 /* felem_sum sets out = out + in. */
220 static void felem_sum(felem out, const felem in)
221 {
222 out[0] += in[0];
223 out[1] += in[1];
224 out[2] += in[2];
225 out[3] += in[3];
226 }
227
228 /* felem_small_sum sets out = out + in. */
229 static void felem_small_sum(felem out, const smallfelem in)
230 {
231 out[0] += in[0];
232 out[1] += in[1];
233 out[2] += in[2];
234 out[3] += in[3];
235 }
236
237 /* felem_scalar sets out = out * scalar */
238 static void felem_scalar(felem out, const u64 scalar)
239 {
240 out[0] *= scalar;
241 out[1] *= scalar;
242 out[2] *= scalar;
243 out[3] *= scalar;
244 }
245
246 /* longfelem_scalar sets out = out * scalar */
247 static void longfelem_scalar(longfelem out, const u64 scalar)
248 {
249 out[0] *= scalar;
250 out[1] *= scalar;
251 out[2] *= scalar;
252 out[3] *= scalar;
253 out[4] *= scalar;
254 out[5] *= scalar;
255 out[6] *= scalar;
256 out[7] *= scalar;
257 }
258
259 # define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
260 # define two105 (((limb)1) << 105)
261 # define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
262
263 /* zero105 is 0 mod p */
264 static const felem zero105 =
265 { two105m41m9, two105, two105m41p9, two105m41p9 };
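/*
 * Worked check (editorial note, not in the upstream source): summing the
 * limbs at their weights 2^0, 2^64, 2^128 and 2^192 gives
 *   zero105 = 2^297 + 2^201 + 2^137 + 2^105 - 2^41 - 2^9
 *           = (2^41 + 2^9) * p,
 * so adding zero105 to a field element leaves its value unchanged mod p.
 * The other zeroNNN constants below are likewise small multiples of p.
 */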
266
267 /*-
268 * smallfelem_neg sets |out| to |-small|
269 * On exit:
270  *   out[i] < 2^105
271 */
272 static void smallfelem_neg(felem out, const smallfelem small)
273 {
274 /* In order to prevent underflow, we subtract from 0 mod p. */
275 out[0] = zero105[0] - small[0];
276 out[1] = zero105[1] - small[1];
277 out[2] = zero105[2] - small[2];
278 out[3] = zero105[3] - small[3];
279 }
280
281 /*-
282 * felem_diff subtracts |in| from |out|
283 * On entry:
284 * in[i] < 2^104
285 * On exit:
286 * out[i] < out[i] + 2^105
287 */
288 static void felem_diff(felem out, const felem in)
289 {
290 /*
291 * In order to prevent underflow, we add 0 mod p before subtracting.
292 */
293 out[0] += zero105[0];
294 out[1] += zero105[1];
295 out[2] += zero105[2];
296 out[3] += zero105[3];
297
298 out[0] -= in[0];
299 out[1] -= in[1];
300 out[2] -= in[2];
301 out[3] -= in[3];
302 }
303
304 # define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
305 # define two107 (((limb)1) << 107)
306 # define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
307
308 /* zero107 is 0 mod p */
309 static const felem zero107 =
310 { two107m43m11, two107, two107m43p11, two107m43p11 };
311
312 /*-
313 * An alternative felem_diff for larger inputs |in|
314 * felem_diff_zero107 subtracts |in| from |out|
315 * On entry:
316 * in[i] < 2^106
317 * On exit:
318 * out[i] < out[i] + 2^107
319 */
320 static void felem_diff_zero107(felem out, const felem in)
321 {
322 /*
323 * In order to prevent underflow, we add 0 mod p before subtracting.
324 */
325 out[0] += zero107[0];
326 out[1] += zero107[1];
327 out[2] += zero107[2];
328 out[3] += zero107[3];
329
330 out[0] -= in[0];
331 out[1] -= in[1];
332 out[2] -= in[2];
333 out[3] -= in[3];
334 }
335
336 /*-
337 * longfelem_diff subtracts |in| from |out|
338 * On entry:
339 * in[i] < 7*2^67
340 * On exit:
341 * out[i] < out[i] + 2^70 + 2^40
342 */
343 static void longfelem_diff(longfelem out, const longfelem in)
344 {
345 static const limb two70m8p6 =
346 (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
347 static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
348 static const limb two70 = (((limb) 1) << 70);
349 static const limb two70m40m38p6 =
350 (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
351 (((limb) 1) << 6);
352 static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
353
354 /* add 0 mod p to avoid underflow */
355 out[0] += two70m8p6;
356 out[1] += two70p40;
357 out[2] += two70;
358 out[3] += two70m40m38p6;
359 out[4] += two70m6;
360 out[5] += two70m6;
361 out[6] += two70m6;
362 out[7] += two70m6;
363
364 /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
365 out[0] -= in[0];
366 out[1] -= in[1];
367 out[2] -= in[2];
368 out[3] -= in[3];
369 out[4] -= in[4];
370 out[5] -= in[5];
371 out[6] -= in[6];
372 out[7] -= in[7];
373 }
374
375 # define two64m0 (((limb)1) << 64) - 1
376 # define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
377 # define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
378 # define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
379
380 /* zero110 is 0 mod p */
381 static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
382
383 /*-
384 * felem_shrink converts an felem into a smallfelem. The result isn't quite
385 * minimal as the value may be greater than p.
386 *
387 * On entry:
388 * in[i] < 2^109
389 * On exit:
390 * out[i] < 2^64
391 */
392 static void felem_shrink(smallfelem out, const felem in)
393 {
394 felem tmp;
395 u64 a, b, mask;
396 u64 high, low;
397 static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
398
399 /* Carry 2->3 */
400 tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
401 /* tmp[3] < 2^110 */
402
403 tmp[2] = zero110[2] + (u64)in[2];
404 tmp[0] = zero110[0] + in[0];
405 tmp[1] = zero110[1] + in[1];
406     /* tmp[0] < 2^110, tmp[1] < 2^111, tmp[2] < 2^65 */
407
408 /*
409 * We perform two partial reductions where we eliminate the high-word of
410 * tmp[3]. We don't update the other words till the end.
411 */
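    /*
     * Editorial note: both partial reductions rely on the congruence
     *   2^256 = 2^224 - 2^192 - 2^96 + 1 (mod p).
     * A carry of a out of the top limb (weight 2^256) is folded back as
     * +a*2^224 and -a*2^192 within tmp[3]; the remaining -a*2^96 and +a
     * are applied afterwards through b in tmp[1] and tmp[0].
     */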
412 a = tmp[3] >> 64; /* a < 2^46 */
413 tmp[3] = (u64)tmp[3];
414 tmp[3] -= a;
415 tmp[3] += ((limb) a) << 32;
416 /* tmp[3] < 2^79 */
417
418 b = a;
419 a = tmp[3] >> 64; /* a < 2^15 */
420 b += a; /* b < 2^46 + 2^15 < 2^47 */
421 tmp[3] = (u64)tmp[3];
422 tmp[3] -= a;
423 tmp[3] += ((limb) a) << 32;
424 /* tmp[3] < 2^64 + 2^47 */
425
426 /*
427 * This adjusts the other two words to complete the two partial
428 * reductions.
429 */
430 tmp[0] += b;
431 tmp[1] -= (((limb) b) << 32);
432
433 /*
434 * In order to make space in tmp[3] for the carry from 2 -> 3, we
435 * conditionally subtract kPrime if tmp[3] is large enough.
436 */
437 high = (u64)(tmp[3] >> 64);
438 /* As tmp[3] < 2^65, high is either 1 or 0 */
439 high = 0 - high;
440 /*-
441 * high is:
442 * all ones if the high word of tmp[3] is 1
443      *   all zeros if the high word of tmp[3] is 0
444 */
445 low = (u64)tmp[3];
446 mask = 0 - (low >> 63);
447 /*-
448 * mask is:
449 * all ones if the MSB of low is 1
450      *   all zeros if the MSB of low is 0
451 */
452 low &= bottom63bits;
453 low -= kPrime3Test;
454 /* if low was greater than kPrime3Test then the MSB is zero */
455 low = ~low;
456 low = 0 - (low >> 63);
457 /*-
458 * low is:
459 * all ones if low was > kPrime3Test
460 * all zeros if low was <= kPrime3Test
461 */
462 mask = (mask & low) | high;
463 tmp[0] -= mask & kPrime[0];
464 tmp[1] -= mask & kPrime[1];
465 /* kPrime[2] is zero, so omitted */
466 tmp[3] -= mask & kPrime[3];
467     /* tmp[3] < 2^64 - 2^32 + 1 */
468
469 tmp[1] += ((u64)(tmp[0] >> 64));
470 tmp[0] = (u64)tmp[0];
471 tmp[2] += ((u64)(tmp[1] >> 64));
472 tmp[1] = (u64)tmp[1];
473 tmp[3] += ((u64)(tmp[2] >> 64));
474 tmp[2] = (u64)tmp[2];
475 /* tmp[i] < 2^64 */
476
477 out[0] = tmp[0];
478 out[1] = tmp[1];
479 out[2] = tmp[2];
480 out[3] = tmp[3];
481 }
482
483 /* smallfelem_expand converts a smallfelem to an felem */
484 static void smallfelem_expand(felem out, const smallfelem in)
485 {
486 out[0] = in[0];
487 out[1] = in[1];
488 out[2] = in[2];
489 out[3] = in[3];
490 }
491
492 /*-
493 * smallfelem_square sets |out| = |small|^2
494 * On entry:
495 * small[i] < 2^64
496 * On exit:
497 * out[i] < 7 * 2^64 < 2^67
498 */
499 static void smallfelem_square(longfelem out, const smallfelem small)
500 {
501 limb a;
502 u64 high, low;
503
504 a = ((uint128_t) small[0]) * small[0];
505 low = a;
506 high = a >> 64;
507 out[0] = low;
508 out[1] = high;
509
510 a = ((uint128_t) small[0]) * small[1];
511 low = a;
512 high = a >> 64;
513 out[1] += low;
514 out[1] += low;
515 out[2] = high;
516
517 a = ((uint128_t) small[0]) * small[2];
518 low = a;
519 high = a >> 64;
520 out[2] += low;
521 out[2] *= 2;
522 out[3] = high;
523
524 a = ((uint128_t) small[0]) * small[3];
525 low = a;
526 high = a >> 64;
527 out[3] += low;
528 out[4] = high;
529
530 a = ((uint128_t) small[1]) * small[2];
531 low = a;
532 high = a >> 64;
533 out[3] += low;
534 out[3] *= 2;
535 out[4] += high;
536
537 a = ((uint128_t) small[1]) * small[1];
538 low = a;
539 high = a >> 64;
540 out[2] += low;
541 out[3] += high;
542
543 a = ((uint128_t) small[1]) * small[3];
544 low = a;
545 high = a >> 64;
546 out[4] += low;
547 out[4] *= 2;
548 out[5] = high;
549
550 a = ((uint128_t) small[2]) * small[3];
551 low = a;
552 high = a >> 64;
553 out[5] += low;
554 out[5] *= 2;
555 out[6] = high;
556 out[6] += high;
557
558 a = ((uint128_t) small[2]) * small[2];
559 low = a;
560 high = a >> 64;
561 out[4] += low;
562 out[5] += high;
563
564 a = ((uint128_t) small[3]) * small[3];
565 low = a;
566 high = a >> 64;
567 out[6] += low;
568 out[7] = high;
569 }
570
571 /*-
572 * felem_square sets |out| = |in|^2
573 * On entry:
574 * in[i] < 2^109
575 * On exit:
576 * out[i] < 7 * 2^64 < 2^67
577 */
578 static void felem_square(longfelem out, const felem in)
579 {
580 u64 small[4];
581 felem_shrink(small, in);
582 smallfelem_square(out, small);
583 }
584
585 /*-
586 * smallfelem_mul sets |out| = |small1| * |small2|
587 * On entry:
588 * small1[i] < 2^64
589 * small2[i] < 2^64
590 * On exit:
591 * out[i] < 7 * 2^64 < 2^67
592 */
593 static void smallfelem_mul(longfelem out, const smallfelem small1,
594 const smallfelem small2)
595 {
596 limb a;
597 u64 high, low;
598
599 a = ((uint128_t) small1[0]) * small2[0];
600 low = a;
601 high = a >> 64;
602 out[0] = low;
603 out[1] = high;
604
605 a = ((uint128_t) small1[0]) * small2[1];
606 low = a;
607 high = a >> 64;
608 out[1] += low;
609 out[2] = high;
610
611 a = ((uint128_t) small1[1]) * small2[0];
612 low = a;
613 high = a >> 64;
614 out[1] += low;
615 out[2] += high;
616
617 a = ((uint128_t) small1[0]) * small2[2];
618 low = a;
619 high = a >> 64;
620 out[2] += low;
621 out[3] = high;
622
623 a = ((uint128_t) small1[1]) * small2[1];
624 low = a;
625 high = a >> 64;
626 out[2] += low;
627 out[3] += high;
628
629 a = ((uint128_t) small1[2]) * small2[0];
630 low = a;
631 high = a >> 64;
632 out[2] += low;
633 out[3] += high;
634
635 a = ((uint128_t) small1[0]) * small2[3];
636 low = a;
637 high = a >> 64;
638 out[3] += low;
639 out[4] = high;
640
641 a = ((uint128_t) small1[1]) * small2[2];
642 low = a;
643 high = a >> 64;
644 out[3] += low;
645 out[4] += high;
646
647 a = ((uint128_t) small1[2]) * small2[1];
648 low = a;
649 high = a >> 64;
650 out[3] += low;
651 out[4] += high;
652
653 a = ((uint128_t) small1[3]) * small2[0];
654 low = a;
655 high = a >> 64;
656 out[3] += low;
657 out[4] += high;
658
659 a = ((uint128_t) small1[1]) * small2[3];
660 low = a;
661 high = a >> 64;
662 out[4] += low;
663 out[5] = high;
664
665 a = ((uint128_t) small1[2]) * small2[2];
666 low = a;
667 high = a >> 64;
668 out[4] += low;
669 out[5] += high;
670
671 a = ((uint128_t) small1[3]) * small2[1];
672 low = a;
673 high = a >> 64;
674 out[4] += low;
675 out[5] += high;
676
677 a = ((uint128_t) small1[2]) * small2[3];
678 low = a;
679 high = a >> 64;
680 out[5] += low;
681 out[6] = high;
682
683 a = ((uint128_t) small1[3]) * small2[2];
684 low = a;
685 high = a >> 64;
686 out[5] += low;
687 out[6] += high;
688
689 a = ((uint128_t) small1[3]) * small2[3];
690 low = a;
691 high = a >> 64;
692 out[6] += low;
693 out[7] = high;
694 }
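/*
 * Editorial note on the 7*2^64 bound quoted above: each output word
 * accumulates at most seven 64-bit quantities. For example, out[3] collects
 * the low halves of small1[0]*small2[3], small1[1]*small2[2],
 * small1[2]*small2[1] and small1[3]*small2[0] plus the high halves of
 * small1[0]*small2[2], small1[1]*small2[1] and small1[2]*small2[0].
 */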
695
696 /*-
697 * felem_mul sets |out| = |in1| * |in2|
698 * On entry:
699 * in1[i] < 2^109
700 * in2[i] < 2^109
701 * On exit:
702 * out[i] < 7 * 2^64 < 2^67
703 */
704 static void felem_mul(longfelem out, const felem in1, const felem in2)
705 {
706 smallfelem small1, small2;
707 felem_shrink(small1, in1);
708 felem_shrink(small2, in2);
709 smallfelem_mul(out, small1, small2);
710 }
711
712 /*-
713 * felem_small_mul sets |out| = |small1| * |in2|
714 * On entry:
715 * small1[i] < 2^64
716 * in2[i] < 2^109
717 * On exit:
718 * out[i] < 7 * 2^64 < 2^67
719 */
720 static void felem_small_mul(longfelem out, const smallfelem small1,
721 const felem in2)
722 {
723 smallfelem small2;
724 felem_shrink(small2, in2);
725 smallfelem_mul(out, small1, small2);
726 }
727
728 # define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
729 # define two100 (((limb)1) << 100)
730 # define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
731 /* zero100 is 0 mod p */
732 static const felem zero100 =
733 { two100m36m4, two100, two100m36p4, two100m36p4 };
734
735 /*-
736 * Internal function for the different flavours of felem_reduce.
737 * felem_reduce_ reduces the higher coefficients in[4]-in[7].
738 * On entry:
739 * out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
740 * out[1] >= in[7] + 2^32*in[4]
741 * out[2] >= in[5] + 2^32*in[5]
742 * out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
743 * On exit:
744 * out[0] <= out[0] + in[4] + 2^32*in[5]
745 * out[1] <= out[1] + in[5] + 2^33*in[6]
746 * out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
747 * out[3] <= out[3] + 2^32*in[4] + 3*in[7]
748 */
749 static void felem_reduce_(felem out, const longfelem in)
750 {
751 int128_t c;
752 /* combine common terms from below */
753 c = in[4] + (in[5] << 32);
754 out[0] += c;
755 out[3] -= c;
756
757 c = in[5] - in[7];
758 out[1] += c;
759 out[2] -= c;
760
761 /* the remaining terms */
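    /*
     * Editorial gloss on the notation below: a line such as
     * "256: [(0,1),(96,-1),(192,-1),(224,1)]" records the congruence
     * 2^256 = 2^0 - 2^96 - 2^192 + 2^224 (mod p); each pair (k, c) means
     * the power being eliminated contributes c * 2^k. That is why in[4]
     * (weight 2^256) is added at bits 0 and 224 and subtracted at bits 96
     * and 192.
     */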
762 /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
763 out[1] -= (in[4] << 32);
764 out[3] += (in[4] << 32);
765
766 /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
767 out[2] -= (in[5] << 32);
768
769 /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
770 out[0] -= in[6];
771 out[0] -= (in[6] << 32);
772 out[1] += (in[6] << 33);
773 out[2] += (in[6] * 2);
774 out[3] -= (in[6] << 32);
775
776 /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
777 out[0] -= in[7];
778 out[0] -= (in[7] << 32);
779 out[2] += (in[7] << 33);
780 out[3] += (in[7] * 3);
781 }
782
783 /*-
784 * felem_reduce converts a longfelem into an felem.
785 * To be called directly after felem_square or felem_mul.
786 * On entry:
787 * in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
788  *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2^64
789 * On exit:
790 * out[i] < 2^101
791 */
792 static void felem_reduce(felem out, const longfelem in)
793 {
794 out[0] = zero100[0] + in[0];
795 out[1] = zero100[1] + in[1];
796 out[2] = zero100[2] + in[2];
797 out[3] = zero100[3] + in[3];
798
799 felem_reduce_(out, in);
800
801 /*-
802 * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
803 * out[1] > 2^100 - 2^64 - 7*2^96 > 0
804 * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
805 * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
806 *
807 * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
808 * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
809 * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
810 * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
811 */
812 }
813
814 /*-
815 * felem_reduce_zero105 converts a larger longfelem into an felem.
816 * On entry:
817  *   in[i] < 2^71
818 * On exit:
819 * out[i] < 2^106
820 */
821 static void felem_reduce_zero105(felem out, const longfelem in)
822 {
823 out[0] = zero105[0] + in[0];
824 out[1] = zero105[1] + in[1];
825 out[2] = zero105[2] + in[2];
826 out[3] = zero105[3] + in[3];
827
828 felem_reduce_(out, in);
829
830 /*-
831 * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
832 * out[1] > 2^105 - 2^71 - 2^103 > 0
833 * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
834 * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
835 *
836 * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
837 * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
838 * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
839 * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
840 */
841 }
842
843 /*
844 * subtract_u64 sets *result = *result - v and *carry to one if the
845 * subtraction underflowed.
846 */
847 static void subtract_u64(u64 *result, u64 *carry, u64 v)
848 {
849 uint128_t r = *result;
850 r -= v;
851 *carry = (r >> 64) & 1;
852 *result = (u64)r;
853 }
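/*
 * For illustration (editorial note): felem_contract below chains these
 * calls so that a borrow ripples through the remaining limbs, e.g. to
 * subtract a value v from the two-limb quantity {out[0], out[1]}:
 *
 *     subtract_u64(&out[0], &carry, v);      (carry becomes 1 on underflow)
 *     subtract_u64(&out[1], &carry, carry);  (propagate the borrow)
 */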
854
855 /*
856 * felem_contract converts |in| to its unique, minimal representation. On
857 * entry: in[i] < 2^109
858 */
859 static void felem_contract(smallfelem out, const felem in)
860 {
861 unsigned i;
862 u64 all_equal_so_far = 0, result = 0, carry;
863
864 felem_shrink(out, in);
865     /* out is minimal except that the value might be > p */
866
867 all_equal_so_far--;
868 /*
869 * We are doing a constant time test if out >= kPrime. We need to compare
870 * each u64, from most-significant to least significant. For each one, if
871      * all words so far have been equal (all_equal_so_far is all ones) then a non-equal
872 * result is the answer. Otherwise we continue.
873 */
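    /*
     * Note (editorial): i is unsigned, so the condition "i < 4" keeps the
     * loop running for i = 3, 2, 1, 0 and terminates once i wraps around
     * after the final decrement.
     */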
874 for (i = 3; i < 4; i--) {
875 u64 equal;
876 uint128_t a = ((uint128_t) kPrime[i]) - out[i];
877 /*
878 * if out[i] > kPrime[i] then a will underflow and the high 64-bits
879 * will all be set.
880 */
881 result |= all_equal_so_far & ((u64)(a >> 64));
882
883 /*
884 * if kPrime[i] == out[i] then |equal| will be all zeros and the
885 * decrement will make it all ones.
886 */
887 equal = kPrime[i] ^ out[i];
888 equal--;
889 equal &= equal << 32;
890 equal &= equal << 16;
891 equal &= equal << 8;
892 equal &= equal << 4;
893 equal &= equal << 2;
894 equal &= equal << 1;
895 equal = 0 - (equal >> 63);
896
897 all_equal_so_far &= equal;
898 }
899
900 /*
901 * if all_equal_so_far is still all ones then the two values are equal
902 * and so out >= kPrime is true.
903 */
904 result |= all_equal_so_far;
905
906 /* if out >= kPrime then we subtract kPrime. */
907 subtract_u64(&out[0], &carry, result & kPrime[0]);
908 subtract_u64(&out[1], &carry, carry);
909 subtract_u64(&out[2], &carry, carry);
910 subtract_u64(&out[3], &carry, carry);
911
912 subtract_u64(&out[1], &carry, result & kPrime[1]);
913 subtract_u64(&out[2], &carry, carry);
914 subtract_u64(&out[3], &carry, carry);
915
916 subtract_u64(&out[2], &carry, result & kPrime[2]);
917 subtract_u64(&out[3], &carry, carry);
918
919 subtract_u64(&out[3], &carry, result & kPrime[3]);
920 }
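/*
 * Illustrative helper (editorial addition, not part of the upstream file
 * and not called anywhere in it): the branch-free equality-mask trick used
 * in felem_contract above and smallfelem_is_zero below, isolated for
 * clarity. Returns an all-ones word when a == b and 0 otherwise, without
 * any data-dependent branches.
 */
static u64 ct_u64_eq_mask_example(u64 a, u64 b)
{
    u64 t = a ^ b;

    t--;                    /* becomes all ones only when a == b */
    t &= t << 32;
    t &= t << 16;
    t &= t << 8;
    t &= t << 4;
    t &= t << 2;
    t &= t << 1;            /* bit 63 is now the AND of all 64 bits */
    return 0 - (t >> 63);   /* broadcast bit 63 across the word */
}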
921
922 static void smallfelem_square_contract(smallfelem out, const smallfelem in)
923 {
924 longfelem longtmp;
925 felem tmp;
926
927 smallfelem_square(longtmp, in);
928 felem_reduce(tmp, longtmp);
929 felem_contract(out, tmp);
930 }
931
932 static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
933 const smallfelem in2)
934 {
935 longfelem longtmp;
936 felem tmp;
937
938 smallfelem_mul(longtmp, in1, in2);
939 felem_reduce(tmp, longtmp);
940 felem_contract(out, tmp);
941 }
942
943 /*-
944  * smallfelem_is_zero returns a limb with all bits set if |small| == 0 (mod p) and 0
945 * otherwise.
946 * On entry:
947 * small[i] < 2^64
948 */
949 static limb smallfelem_is_zero(const smallfelem small)
950 {
951 limb result;
952 u64 is_p;
953
954 u64 is_zero = small[0] | small[1] | small[2] | small[3];
955 is_zero--;
956 is_zero &= is_zero << 32;
957 is_zero &= is_zero << 16;
958 is_zero &= is_zero << 8;
959 is_zero &= is_zero << 4;
960 is_zero &= is_zero << 2;
961 is_zero &= is_zero << 1;
962 is_zero = 0 - (is_zero >> 63);
963
964 is_p = (small[0] ^ kPrime[0]) |
965 (small[1] ^ kPrime[1]) |
966 (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
967 is_p--;
968 is_p &= is_p << 32;
969 is_p &= is_p << 16;
970 is_p &= is_p << 8;
971 is_p &= is_p << 4;
972 is_p &= is_p << 2;
973 is_p &= is_p << 1;
974 is_p = 0 - (is_p >> 63);
975
976 is_zero |= is_p;
977
978 result = is_zero;
979 result |= ((limb) is_zero) << 64;
980 return result;
981 }
982
983 static int smallfelem_is_zero_int(const void *small)
984 {
985 return (int)(smallfelem_is_zero(small) & ((limb) 1));
986 }
987
988 /*-
989 * felem_inv calculates |out| = |in|^{-1}
990 *
991 * Based on Fermat's Little Theorem:
992 * a^p = a (mod p)
993 * a^{p-1} = 1 (mod p)
994 * a^{p-2} = a^{-1} (mod p)
995 */
996 static void felem_inv(felem out, const felem in)
997 {
998 felem ftmp, ftmp2;
999 /* each e_I will hold |in|^{2^I - 1} */
1000 felem e2, e4, e8, e16, e32, e64;
1001 longfelem tmp;
1002 unsigned i;
1003
1004 felem_square(tmp, in);
1005 felem_reduce(ftmp, tmp); /* 2^1 */
1006 felem_mul(tmp, in, ftmp);
1007 felem_reduce(ftmp, tmp); /* 2^2 - 2^0 */
1008 felem_assign(e2, ftmp);
1009 felem_square(tmp, ftmp);
1010 felem_reduce(ftmp, tmp); /* 2^3 - 2^1 */
1011 felem_square(tmp, ftmp);
1012 felem_reduce(ftmp, tmp); /* 2^4 - 2^2 */
1013 felem_mul(tmp, ftmp, e2);
1014 felem_reduce(ftmp, tmp); /* 2^4 - 2^0 */
1015 felem_assign(e4, ftmp);
1016 felem_square(tmp, ftmp);
1017 felem_reduce(ftmp, tmp); /* 2^5 - 2^1 */
1018 felem_square(tmp, ftmp);
1019 felem_reduce(ftmp, tmp); /* 2^6 - 2^2 */
1020 felem_square(tmp, ftmp);
1021 felem_reduce(ftmp, tmp); /* 2^7 - 2^3 */
1022 felem_square(tmp, ftmp);
1023 felem_reduce(ftmp, tmp); /* 2^8 - 2^4 */
1024 felem_mul(tmp, ftmp, e4);
1025 felem_reduce(ftmp, tmp); /* 2^8 - 2^0 */
1026 felem_assign(e8, ftmp);
1027 for (i = 0; i < 8; i++) {
1028 felem_square(tmp, ftmp);
1029 felem_reduce(ftmp, tmp);
1030 } /* 2^16 - 2^8 */
1031 felem_mul(tmp, ftmp, e8);
1032 felem_reduce(ftmp, tmp); /* 2^16 - 2^0 */
1033 felem_assign(e16, ftmp);
1034 for (i = 0; i < 16; i++) {
1035 felem_square(tmp, ftmp);
1036 felem_reduce(ftmp, tmp);
1037 } /* 2^32 - 2^16 */
1038 felem_mul(tmp, ftmp, e16);
1039 felem_reduce(ftmp, tmp); /* 2^32 - 2^0 */
1040 felem_assign(e32, ftmp);
1041 for (i = 0; i < 32; i++) {
1042 felem_square(tmp, ftmp);
1043 felem_reduce(ftmp, tmp);
1044 } /* 2^64 - 2^32 */
1045 felem_assign(e64, ftmp);
1046 felem_mul(tmp, ftmp, in);
1047 felem_reduce(ftmp, tmp); /* 2^64 - 2^32 + 2^0 */
1048 for (i = 0; i < 192; i++) {
1049 felem_square(tmp, ftmp);
1050 felem_reduce(ftmp, tmp);
1051 } /* 2^256 - 2^224 + 2^192 */
1052
1053 felem_mul(tmp, e64, e32);
1054 felem_reduce(ftmp2, tmp); /* 2^64 - 2^0 */
1055 for (i = 0; i < 16; i++) {
1056 felem_square(tmp, ftmp2);
1057 felem_reduce(ftmp2, tmp);
1058 } /* 2^80 - 2^16 */
1059 felem_mul(tmp, ftmp2, e16);
1060 felem_reduce(ftmp2, tmp); /* 2^80 - 2^0 */
1061 for (i = 0; i < 8; i++) {
1062 felem_square(tmp, ftmp2);
1063 felem_reduce(ftmp2, tmp);
1064 } /* 2^88 - 2^8 */
1065 felem_mul(tmp, ftmp2, e8);
1066 felem_reduce(ftmp2, tmp); /* 2^88 - 2^0 */
1067 for (i = 0; i < 4; i++) {
1068 felem_square(tmp, ftmp2);
1069 felem_reduce(ftmp2, tmp);
1070 } /* 2^92 - 2^4 */
1071 felem_mul(tmp, ftmp2, e4);
1072 felem_reduce(ftmp2, tmp); /* 2^92 - 2^0 */
1073 felem_square(tmp, ftmp2);
1074 felem_reduce(ftmp2, tmp); /* 2^93 - 2^1 */
1075 felem_square(tmp, ftmp2);
1076 felem_reduce(ftmp2, tmp); /* 2^94 - 2^2 */
1077 felem_mul(tmp, ftmp2, e2);
1078 felem_reduce(ftmp2, tmp); /* 2^94 - 2^0 */
1079 felem_square(tmp, ftmp2);
1080 felem_reduce(ftmp2, tmp); /* 2^95 - 2^1 */
1081 felem_square(tmp, ftmp2);
1082 felem_reduce(ftmp2, tmp); /* 2^96 - 2^2 */
1083 felem_mul(tmp, ftmp2, in);
1084 felem_reduce(ftmp2, tmp); /* 2^96 - 3 */
1085
1086 felem_mul(tmp, ftmp2, ftmp);
1087 felem_reduce(out, tmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1088 }
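/*
 * Illustrative sanity check (editorial addition, not part of the upstream
 * file and not called anywhere in it): by Fermat's little theorem, the
 * product of a non-zero |in| (with in[i] < 2^109 as usual) and
 * felem_inv(|in|) must contract to exactly 1.
 */
static int felem_inv_check_example(const felem in)
{
    felem inv, prod;
    longfelem tmp;
    smallfelem result, one;

    felem_inv(inv, in);             /* inv = in^{p-2} = in^{-1} (mod p) */
    felem_mul(tmp, in, inv);
    felem_reduce(prod, tmp);
    felem_contract(result, prod);   /* minimal representation of in*inv */
    smallfelem_one(one);
    return result[0] == one[0] && result[1] == one[1]
        && result[2] == one[2] && result[3] == one[3];
}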
1089
1090 static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1091 {
1092 felem tmp;
1093
1094 smallfelem_expand(tmp, in);
1095 felem_inv(tmp, tmp);
1096 felem_contract(out, tmp);
1097 }
1098
1099 /*-
1100 * Group operations
1101 * ----------------
1102 *
1103 * Building on top of the field operations we have the operations on the
1104 * elliptic curve group itself. Points on the curve are represented in Jacobian
1105 * coordinates
1106 */
1107
1108 /*-
1109 * point_double calculates 2*(x_in, y_in, z_in)
1110 *
1111 * The method is taken from:
1112 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1113 *
1114  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
1115  * while x_out == y_in is not (maybe this works, but it's not tested).
1116 */
1117 static void
1118 point_double(felem x_out, felem y_out, felem z_out,
1119 const felem x_in, const felem y_in, const felem z_in)
1120 {
1121 longfelem tmp, tmp2;
1122 felem delta, gamma, beta, alpha, ftmp, ftmp2;
1123 smallfelem small1, small2;
1124
1125 felem_assign(ftmp, x_in);
1126 /* ftmp[i] < 2^106 */
1127 felem_assign(ftmp2, x_in);
1128 /* ftmp2[i] < 2^106 */
1129
1130 /* delta = z^2 */
1131 felem_square(tmp, z_in);
1132 felem_reduce(delta, tmp);
1133 /* delta[i] < 2^101 */
1134
1135 /* gamma = y^2 */
1136 felem_square(tmp, y_in);
1137 felem_reduce(gamma, tmp);
1138 /* gamma[i] < 2^101 */
1139 felem_shrink(small1, gamma);
1140
1141 /* beta = x*gamma */
1142 felem_small_mul(tmp, small1, x_in);
1143 felem_reduce(beta, tmp);
1144 /* beta[i] < 2^101 */
1145
1146 /* alpha = 3*(x-delta)*(x+delta) */
1147 felem_diff(ftmp, delta);
1148 /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1149 felem_sum(ftmp2, delta);
1150 /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1151 felem_scalar(ftmp2, 3);
1152 /* ftmp2[i] < 3 * 2^107 < 2^109 */
1153 felem_mul(tmp, ftmp, ftmp2);
1154 felem_reduce(alpha, tmp);
1155 /* alpha[i] < 2^101 */
1156 felem_shrink(small2, alpha);
1157
1158 /* x' = alpha^2 - 8*beta */
1159 smallfelem_square(tmp, small2);
1160 felem_reduce(x_out, tmp);
1161 felem_assign(ftmp, beta);
1162 felem_scalar(ftmp, 8);
1163 /* ftmp[i] < 8 * 2^101 = 2^104 */
1164 felem_diff(x_out, ftmp);
1165 /* x_out[i] < 2^105 + 2^101 < 2^106 */
1166
1167 /* z' = (y + z)^2 - gamma - delta */
1168 felem_sum(delta, gamma);
1169 /* delta[i] < 2^101 + 2^101 = 2^102 */
1170 felem_assign(ftmp, y_in);
1171 felem_sum(ftmp, z_in);
1172 /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1173 felem_square(tmp, ftmp);
1174 felem_reduce(z_out, tmp);
1175 felem_diff(z_out, delta);
1176 /* z_out[i] < 2^105 + 2^101 < 2^106 */
1177
1178 /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1179 felem_scalar(beta, 4);
1180 /* beta[i] < 4 * 2^101 = 2^103 */
1181 felem_diff_zero107(beta, x_out);
1182 /* beta[i] < 2^107 + 2^103 < 2^108 */
1183 felem_small_mul(tmp, small2, beta);
1184 /* tmp[i] < 7 * 2^64 < 2^67 */
1185 smallfelem_square(tmp2, small1);
1186 /* tmp2[i] < 7 * 2^64 */
1187 longfelem_scalar(tmp2, 8);
1188 /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1189 longfelem_diff(tmp, tmp2);
1190 /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1191 felem_reduce_zero105(y_out, tmp);
1192 /* y_out[i] < 2^106 */
1193 }
1194
1195 /*
1196 * point_double_small is the same as point_double, except that it operates on
1197 * smallfelems
1198 */
1199 static void
1200 point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1201 const smallfelem x_in, const smallfelem y_in,
1202 const smallfelem z_in)
1203 {
1204 felem felem_x_out, felem_y_out, felem_z_out;
1205 felem felem_x_in, felem_y_in, felem_z_in;
1206
1207 smallfelem_expand(felem_x_in, x_in);
1208 smallfelem_expand(felem_y_in, y_in);
1209 smallfelem_expand(felem_z_in, z_in);
1210 point_double(felem_x_out, felem_y_out, felem_z_out,
1211 felem_x_in, felem_y_in, felem_z_in);
1212 felem_shrink(x_out, felem_x_out);
1213 felem_shrink(y_out, felem_y_out);
1214 felem_shrink(z_out, felem_z_out);
1215 }
1216
1217 /* copy_conditional copies in to out iff mask is all ones. */
1218 static void copy_conditional(felem out, const felem in, limb mask)
1219 {
1220 unsigned i;
1221 for (i = 0; i < NLIMBS; ++i) {
1222 const limb tmp = mask & (in[i] ^ out[i]);
1223 out[i] ^= tmp;
1224 }
1225 }
1226
1227 /* copy_small_conditional copies in to out iff mask is all ones. */
1228 static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1229 {
1230 unsigned i;
1231 const u64 mask64 = mask;
1232 for (i = 0; i < NLIMBS; ++i) {
1233 out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1234 }
1235 }
1236
1237 /*-
1238 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1239 *
1240 * The method is taken from:
1241 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1242 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1243 *
1244 * This function includes a branch for checking whether the two input points
1245  * are equal (while not equal to the point at infinity). This case never
1246 * happens during single point multiplication, so there is no timing leak for
1247 * ECDH or ECDSA signing.
1248 */
1249 static void point_add(felem x3, felem y3, felem z3,
1250 const felem x1, const felem y1, const felem z1,
1251 const int mixed, const smallfelem x2,
1252 const smallfelem y2, const smallfelem z2)
1253 {
1254 felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1255 longfelem tmp, tmp2;
1256 smallfelem small1, small2, small3, small4, small5;
1257 limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1258
1259 felem_shrink(small3, z1);
1260
1261 z1_is_zero = smallfelem_is_zero(small3);
1262 z2_is_zero = smallfelem_is_zero(z2);
1263
1264 /* ftmp = z1z1 = z1**2 */
1265 smallfelem_square(tmp, small3);
1266 felem_reduce(ftmp, tmp);
1267 /* ftmp[i] < 2^101 */
1268 felem_shrink(small1, ftmp);
1269
1270 if (!mixed) {
1271 /* ftmp2 = z2z2 = z2**2 */
1272 smallfelem_square(tmp, z2);
1273 felem_reduce(ftmp2, tmp);
1274 /* ftmp2[i] < 2^101 */
1275 felem_shrink(small2, ftmp2);
1276
1277 felem_shrink(small5, x1);
1278
1279 /* u1 = ftmp3 = x1*z2z2 */
1280 smallfelem_mul(tmp, small5, small2);
1281 felem_reduce(ftmp3, tmp);
1282 /* ftmp3[i] < 2^101 */
1283
1284 /* ftmp5 = z1 + z2 */
1285 felem_assign(ftmp5, z1);
1286 felem_small_sum(ftmp5, z2);
1287 /* ftmp5[i] < 2^107 */
1288
1289 /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1290 felem_square(tmp, ftmp5);
1291 felem_reduce(ftmp5, tmp);
1292 /* ftmp2 = z2z2 + z1z1 */
1293 felem_sum(ftmp2, ftmp);
1294 /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1295 felem_diff(ftmp5, ftmp2);
1296 /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1297
1298 /* ftmp2 = z2 * z2z2 */
1299 smallfelem_mul(tmp, small2, z2);
1300 felem_reduce(ftmp2, tmp);
1301
1302 /* s1 = ftmp2 = y1 * z2**3 */
1303 felem_mul(tmp, y1, ftmp2);
1304 felem_reduce(ftmp6, tmp);
1305 /* ftmp6[i] < 2^101 */
1306 } else {
1307 /*
1308 * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1309 */
1310
1311 /* u1 = ftmp3 = x1*z2z2 */
1312 felem_assign(ftmp3, x1);
1313 /* ftmp3[i] < 2^106 */
1314
1315 /* ftmp5 = 2z1z2 */
1316 felem_assign(ftmp5, z1);
1317 felem_scalar(ftmp5, 2);
1318 /* ftmp5[i] < 2*2^106 = 2^107 */
1319
1320 /* s1 = ftmp2 = y1 * z2**3 */
1321 felem_assign(ftmp6, y1);
1322 /* ftmp6[i] < 2^106 */
1323 }
1324
1325 /* u2 = x2*z1z1 */
1326 smallfelem_mul(tmp, x2, small1);
1327 felem_reduce(ftmp4, tmp);
1328
1329 /* h = ftmp4 = u2 - u1 */
1330 felem_diff_zero107(ftmp4, ftmp3);
1331 /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1332 felem_shrink(small4, ftmp4);
1333
1334 x_equal = smallfelem_is_zero(small4);
1335
1336 /* z_out = ftmp5 * h */
1337 felem_small_mul(tmp, small4, ftmp5);
1338 felem_reduce(z_out, tmp);
1339 /* z_out[i] < 2^101 */
1340
1341 /* ftmp = z1 * z1z1 */
1342 smallfelem_mul(tmp, small1, small3);
1343 felem_reduce(ftmp, tmp);
1344
1345 /* s2 = tmp = y2 * z1**3 */
1346 felem_small_mul(tmp, y2, ftmp);
1347 felem_reduce(ftmp5, tmp);
1348
1349 /* r = ftmp5 = (s2 - s1)*2 */
1350 felem_diff_zero107(ftmp5, ftmp6);
1351 /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1352 felem_scalar(ftmp5, 2);
1353 /* ftmp5[i] < 2^109 */
1354 felem_shrink(small1, ftmp5);
1355 y_equal = smallfelem_is_zero(small1);
1356
1357 if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1358 point_double(x3, y3, z3, x1, y1, z1);
1359 return;
1360 }
1361
1362 /* I = ftmp = (2h)**2 */
1363 felem_assign(ftmp, ftmp4);
1364 felem_scalar(ftmp, 2);
1365 /* ftmp[i] < 2*2^108 = 2^109 */
1366 felem_square(tmp, ftmp);
1367 felem_reduce(ftmp, tmp);
1368
1369 /* J = ftmp2 = h * I */
1370 felem_mul(tmp, ftmp4, ftmp);
1371 felem_reduce(ftmp2, tmp);
1372
1373 /* V = ftmp4 = U1 * I */
1374 felem_mul(tmp, ftmp3, ftmp);
1375 felem_reduce(ftmp4, tmp);
1376
1377 /* x_out = r**2 - J - 2V */
1378 smallfelem_square(tmp, small1);
1379 felem_reduce(x_out, tmp);
1380 felem_assign(ftmp3, ftmp4);
1381 felem_scalar(ftmp4, 2);
1382 felem_sum(ftmp4, ftmp2);
1383 /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1384 felem_diff(x_out, ftmp4);
1385 /* x_out[i] < 2^105 + 2^101 */
1386
1387 /* y_out = r(V-x_out) - 2 * s1 * J */
1388 felem_diff_zero107(ftmp3, x_out);
1389 /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1390 felem_small_mul(tmp, small1, ftmp3);
1391 felem_mul(tmp2, ftmp6, ftmp2);
1392 longfelem_scalar(tmp2, 2);
1393 /* tmp2[i] < 2*2^67 = 2^68 */
1394 longfelem_diff(tmp, tmp2);
1395 /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1396 felem_reduce_zero105(y_out, tmp);
1397 /* y_out[i] < 2^106 */
1398
1399 copy_small_conditional(x_out, x2, z1_is_zero);
1400 copy_conditional(x_out, x1, z2_is_zero);
1401 copy_small_conditional(y_out, y2, z1_is_zero);
1402 copy_conditional(y_out, y1, z2_is_zero);
1403 copy_small_conditional(z_out, z2, z1_is_zero);
1404 copy_conditional(z_out, z1, z2_is_zero);
1405 felem_assign(x3, x_out);
1406 felem_assign(y3, y_out);
1407 felem_assign(z3, z_out);
1408 }
1409
1410 /*
1411 * point_add_small is the same as point_add, except that it operates on
1412 * smallfelems
1413 */
1414 static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1415 smallfelem x1, smallfelem y1, smallfelem z1,
1416 smallfelem x2, smallfelem y2, smallfelem z2)
1417 {
1418 felem felem_x3, felem_y3, felem_z3;
1419 felem felem_x1, felem_y1, felem_z1;
1420 smallfelem_expand(felem_x1, x1);
1421 smallfelem_expand(felem_y1, y1);
1422 smallfelem_expand(felem_z1, z1);
1423 point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1424 x2, y2, z2);
1425 felem_shrink(x3, felem_x3);
1426 felem_shrink(y3, felem_y3);
1427 felem_shrink(z3, felem_z3);
1428 }
1429
1430 /*-
1431 * Base point pre computation
1432 * --------------------------
1433 *
1434 * Two different sorts of precomputed tables are used in the following code.
1435  * Each contains various points on the curve, where each point is three field
1436 * elements (x, y, z).
1437 *
1438 * For the base point table, z is usually 1 (0 for the point at infinity).
1439 * This table has 2 * 16 elements, starting with the following:
1440 * index | bits | point
1441 * ------+---------+------------------------------
1442 * 0 | 0 0 0 0 | 0G
1443 * 1 | 0 0 0 1 | 1G
1444 * 2 | 0 0 1 0 | 2^64G
1445 * 3 | 0 0 1 1 | (2^64 + 1)G
1446 * 4 | 0 1 0 0 | 2^128G
1447 * 5 | 0 1 0 1 | (2^128 + 1)G
1448 * 6 | 0 1 1 0 | (2^128 + 2^64)G
1449 * 7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1450 * 8 | 1 0 0 0 | 2^192G
1451 * 9 | 1 0 0 1 | (2^192 + 1)G
1452 * 10 | 1 0 1 0 | (2^192 + 2^64)G
1453 * 11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1454 * 12 | 1 1 0 0 | (2^192 + 2^128)G
1455 * 13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1456 * 14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1457 * 15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1458 * followed by a copy of this with each element multiplied by 2^32.
1459 *
1460 * The reason for this is so that we can clock bits into four different
1461 * locations when doing simple scalar multiplies against the base point,
1462 * and then another four locations using the second 16 elements.
1463 *
1464 * Tables for other points have table[i] = iG for i in 0 .. 16. */
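/*
 * Illustrative example (editorial note, matching batch_mul below): at scalar
 * bit position i, bits i, i+64, i+128 and i+192 are gathered into an index
 * into the first sub-table, and bits i+32, i+96, i+160 and i+224 into an
 * index into the second. If, say, only bits i+32 and i+160 of the scalar are
 * set, the second index is binary 0101 = 5, which selects (2^128 + 1)G*2^32.
 */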
1465
1466 /* gmul is the table of precomputed base points */
1467 static const smallfelem gmul[2][16][3] = {
1468 {{{0, 0, 0, 0},
1469 {0, 0, 0, 0},
1470 {0, 0, 0, 0}},
1471 {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1472 0x6b17d1f2e12c4247},
1473 {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1474 0x4fe342e2fe1a7f9b},
1475 {1, 0, 0, 0}},
1476 {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1477 0x0fa822bc2811aaa5},
1478 {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1479 0xbff44ae8f5dba80d},
1480 {1, 0, 0, 0}},
1481 {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1482 0x300a4bbc89d6726f},
1483 {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1484 0x72aac7e0d09b4644},
1485 {1, 0, 0, 0}},
1486 {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1487 0x447d739beedb5e67},
1488 {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1489 0x2d4825ab834131ee},
1490 {1, 0, 0, 0}},
1491 {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1492 0xef9519328a9c72ff},
1493 {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1494 0x611e9fc37dbb2c9b},
1495 {1, 0, 0, 0}},
1496 {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1497 0x550663797b51f5d8},
1498 {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1499 0x157164848aecb851},
1500 {1, 0, 0, 0}},
1501 {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1502 0xeb5d7745b21141ea},
1503 {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1504 0xeafd72ebdbecc17b},
1505 {1, 0, 0, 0}},
1506 {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1507 0xa6d39677a7849276},
1508 {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1509 0x674f84749b0b8816},
1510 {1, 0, 0, 0}},
1511 {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1512 0x4e769e7672c9ddad},
1513 {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1514 0x42b99082de830663},
1515 {1, 0, 0, 0}},
1516 {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1517 0x78878ef61c6ce04d},
1518 {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1519 0xb6cb3f5d7b72c321},
1520 {1, 0, 0, 0}},
1521 {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1522 0x0c88bc4d716b1287},
1523 {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1524 0xdd5ddea3f3901dc6},
1525 {1, 0, 0, 0}},
1526 {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1527 0x68f344af6b317466},
1528 {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1529 0x31b9c405f8540a20},
1530 {1, 0, 0, 0}},
1531 {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1532 0x4052bf4b6f461db9},
1533 {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1534 0xfecf4d5190b0fc61},
1535 {1, 0, 0, 0}},
1536 {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1537 0x1eddbae2c802e41a},
1538 {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1539 0x43104d86560ebcfc},
1540 {1, 0, 0, 0}},
1541 {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1542 0xb48e26b484f7a21c},
1543 {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1544 0xfac015404d4d3dab},
1545 {1, 0, 0, 0}}},
1546 {{{0, 0, 0, 0},
1547 {0, 0, 0, 0},
1548 {0, 0, 0, 0}},
1549 {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1550 0x7fe36b40af22af89},
1551 {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1552 0xe697d45825b63624},
1553 {1, 0, 0, 0}},
1554 {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1555 0x4a5b506612a677a6},
1556 {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1557 0xeb13461ceac089f1},
1558 {1, 0, 0, 0}},
1559 {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1560 0x0781b8291c6a220a},
1561 {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1562 0x690cde8df0151593},
1563 {1, 0, 0, 0}},
1564 {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1565 0x8a535f566ec73617},
1566 {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1567 0x0455c08468b08bd7},
1568 {1, 0, 0, 0}},
1569 {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1570 0x06bada7ab77f8276},
1571 {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1572 0x5b476dfd0e6cb18a},
1573 {1, 0, 0, 0}},
1574 {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1575 0x3e29864e8a2ec908},
1576 {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1577 0x239b90ea3dc31e7e},
1578 {1, 0, 0, 0}},
1579 {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1580 0x820f4dd949f72ff7},
1581 {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1582 0x140406ec783a05ec},
1583 {1, 0, 0, 0}},
1584 {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1585 0x68f6b8542783dfee},
1586 {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1587 0xcbe1feba92e40ce6},
1588 {1, 0, 0, 0}},
1589 {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1590 0xd0b2f94d2f420109},
1591 {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1592 0x971459828b0719e5},
1593 {1, 0, 0, 0}},
1594 {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1595 0x961610004a866aba},
1596 {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1597 0x7acb9fadcee75e44},
1598 {1, 0, 0, 0}},
1599 {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1600 0x24eb9acca333bf5b},
1601 {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1602 0x69f891c5acd079cc},
1603 {1, 0, 0, 0}},
1604 {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1605 0xe51f547c5972a107},
1606 {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1607 0x1c309a2b25bb1387},
1608 {1, 0, 0, 0}},
1609 {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1610 0x20b87b8aa2c4e503},
1611 {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1612 0xf5c6fa49919776be},
1613 {1, 0, 0, 0}},
1614 {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1615 0x1ed7d1b9332010b9},
1616 {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1617 0x3a2b03f03217257a},
1618 {1, 0, 0, 0}},
1619 {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1620 0x15fee545c78dd9f6},
1621 {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1622 0x4ab5b6b2b8753f81},
1623 {1, 0, 0, 0}}}
1624 };
1625
1626 /*
1627 * select_point selects the |idx|th point from a precomputation table and
1628 * copies it to out.
1629 */
1630 static void select_point(const u64 idx, unsigned int size,
1631 const smallfelem pre_comp[16][3], smallfelem out[3])
1632 {
1633 unsigned i, j;
1634 u64 *outlimbs = &out[0][0];
1635
1636 memset(out, 0, sizeof(*out) * 3);
1637
1638 for (i = 0; i < size; i++) {
1639 const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1640 u64 mask = i ^ idx;
1641 mask |= mask >> 4;
1642 mask |= mask >> 2;
1643 mask |= mask >> 1;
1644 mask &= 1;
1645 mask--;
1646 for (j = 0; j < NLIMBS * 3; j++)
1647 outlimbs[j] |= inlimbs[j] & mask;
1648 }
1649 }
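/*
 * For example (editorial note): select_point(5, 16, g_pre_comp[0], out)
 * copies the entry for (2^128 + 1)G into |out|. Every table entry is read
 * and masked regardless of |idx|, so the memory access pattern does not
 * depend on the secret index.
 */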
1650
1651 /* get_bit returns the |i|th bit in |in| */
1652 static char get_bit(const felem_bytearray in, int i)
1653 {
1654 if ((i < 0) || (i >= 256))
1655 return 0;
1656 return (in[i >> 3] >> (i & 7)) & 1;
1657 }
1658
1659 /*
1660 * Interleaved point multiplication using precomputed point multiples: The
1661  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1662 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1663 * generator, using certain (large) precomputed multiples in g_pre_comp.
1664 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1665 */
1666 static void batch_mul(felem x_out, felem y_out, felem z_out,
1667 const felem_bytearray scalars[],
1668 const unsigned num_points, const u8 *g_scalar,
1669 const int mixed, const smallfelem pre_comp[][17][3],
1670 const smallfelem g_pre_comp[2][16][3])
1671 {
1672 int i, skip;
1673 unsigned num, gen_mul = (g_scalar != NULL);
1674 felem nq[3], ftmp;
1675 smallfelem tmp[3];
1676 u64 bits;
1677 u8 sign, digit;
1678
1679 /* set nq to the point at infinity */
1680 memset(nq, 0, sizeof(nq));
1681
1682 /*
1683 * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1684 * of the generator (two in each of the last 32 rounds) and additions of
1685 * other points multiples (every 5th round).
1686 */
1687 skip = 1; /* save two point operations in the first
1688 * round */
1689 for (i = (num_points ? 255 : 31); i >= 0; --i) {
1690 /* double */
1691 if (!skip)
1692 point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1693
1694 /* add multiples of the generator */
1695 if (gen_mul && (i <= 31)) {
1696 /* first, look 32 bits upwards */
1697 bits = get_bit(g_scalar, i + 224) << 3;
1698 bits |= get_bit(g_scalar, i + 160) << 2;
1699 bits |= get_bit(g_scalar, i + 96) << 1;
1700 bits |= get_bit(g_scalar, i + 32);
1701 /* select the point to add, in constant time */
1702 select_point(bits, 16, g_pre_comp[1], tmp);
1703
1704 if (!skip) {
1705 /* Arg 1 below is for "mixed" */
1706 point_add(nq[0], nq[1], nq[2],
1707 nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1708 } else {
1709 smallfelem_expand(nq[0], tmp[0]);
1710 smallfelem_expand(nq[1], tmp[1]);
1711 smallfelem_expand(nq[2], tmp[2]);
1712 skip = 0;
1713 }
1714
1715 /* second, look at the current position */
1716 bits = get_bit(g_scalar, i + 192) << 3;
1717 bits |= get_bit(g_scalar, i + 128) << 2;
1718 bits |= get_bit(g_scalar, i + 64) << 1;
1719 bits |= get_bit(g_scalar, i);
1720 /* select the point to add, in constant time */
1721 select_point(bits, 16, g_pre_comp[0], tmp);
1722 /* Arg 1 below is for "mixed" */
1723 point_add(nq[0], nq[1], nq[2],
1724 nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1725 }
1726
1727 /* do other additions every 5 doublings */
1728 if (num_points && (i % 5 == 0)) {
1729 /* loop over all scalars */
1730 for (num = 0; num < num_points; ++num) {
1731 bits = get_bit(scalars[num], i + 4) << 5;
1732 bits |= get_bit(scalars[num], i + 3) << 4;
1733 bits |= get_bit(scalars[num], i + 2) << 3;
1734 bits |= get_bit(scalars[num], i + 1) << 2;
1735 bits |= get_bit(scalars[num], i) << 1;
1736 bits |= get_bit(scalars[num], i - 1);
1737 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1738
1739 /*
1740 * select the point to add or subtract, in constant time
1741 */
1742 select_point(digit, 17, pre_comp[num], tmp);
1743 smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1744 * point */
1745 copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1746 felem_contract(tmp[1], ftmp);
1747
1748 if (!skip) {
1749 point_add(nq[0], nq[1], nq[2],
1750 nq[0], nq[1], nq[2],
1751 mixed, tmp[0], tmp[1], tmp[2]);
1752 } else {
1753 smallfelem_expand(nq[0], tmp[0]);
1754 smallfelem_expand(nq[1], tmp[1]);
1755 smallfelem_expand(nq[2], tmp[2]);
1756 skip = 0;
1757 }
1758 }
1759 }
1760 }
1761 felem_assign(x_out, nq[0]);
1762 felem_assign(y_out, nq[1]);
1763 felem_assign(z_out, nq[2]);
1764 }
1765
1766 /* Precomputation for the group generator. */
1767 struct nistp256_pre_comp_st {
1768 smallfelem g_pre_comp[2][16][3];
1769 CRYPTO_REF_COUNT references;
1770 CRYPTO_RWLOCK *lock;
1771 };
1772
1773 const EC_METHOD *EC_GFp_nistp256_method(void)
1774 {
1775 static const EC_METHOD ret = {
1776 EC_FLAGS_DEFAULT_OCT,
1777 NID_X9_62_prime_field,
1778 ec_GFp_nistp256_group_init,
1779 ec_GFp_simple_group_finish,
1780 ec_GFp_simple_group_clear_finish,
1781 ec_GFp_nist_group_copy,
1782 ec_GFp_nistp256_group_set_curve,
1783 ec_GFp_simple_group_get_curve,
1784 ec_GFp_simple_group_get_degree,
1785 ec_group_simple_order_bits,
1786 ec_GFp_simple_group_check_discriminant,
1787 ec_GFp_simple_point_init,
1788 ec_GFp_simple_point_finish,
1789 ec_GFp_simple_point_clear_finish,
1790 ec_GFp_simple_point_copy,
1791 ec_GFp_simple_point_set_to_infinity,
1792 ec_GFp_simple_set_Jprojective_coordinates_GFp,
1793 ec_GFp_simple_get_Jprojective_coordinates_GFp,
1794 ec_GFp_simple_point_set_affine_coordinates,
1795 ec_GFp_nistp256_point_get_affine_coordinates,
1796 0 /* point_set_compressed_coordinates */ ,
1797 0 /* point2oct */ ,
1798 0 /* oct2point */ ,
1799 ec_GFp_simple_add,
1800 ec_GFp_simple_dbl,
1801 ec_GFp_simple_invert,
1802 ec_GFp_simple_is_at_infinity,
1803 ec_GFp_simple_is_on_curve,
1804 ec_GFp_simple_cmp,
1805 ec_GFp_simple_make_affine,
1806 ec_GFp_simple_points_make_affine,
1807 ec_GFp_nistp256_points_mul,
1808 ec_GFp_nistp256_precompute_mult,
1809 ec_GFp_nistp256_have_precompute_mult,
1810 ec_GFp_nist_field_mul,
1811 ec_GFp_nist_field_sqr,
1812 0 /* field_div */ ,
1813 ec_GFp_simple_field_inv,
1814 0 /* field_encode */ ,
1815 0 /* field_decode */ ,
1816 0, /* field_set_to_one */
1817 ec_key_simple_priv2oct,
1818 ec_key_simple_oct2priv,
1819 0, /* set private */
1820 ec_key_simple_generate_key,
1821 ec_key_simple_check_key,
1822 ec_key_simple_generate_public_key,
1823 0, /* keycopy */
1824 0, /* keyfinish */
1825 ecdh_simple_compute_key,
1826 0, /* field_inverse_mod_ord */
1827 0, /* blind_coordinates */
1828 0, /* ladder_pre */
1829 0, /* ladder_step */
1830 0 /* ladder_post */
1831 };
1832
1833 return &ret;
1834 }
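/*
 * Usage sketch (editorial note, assuming the public EC_GROUP API): builds
 * that enable the 64-bit nistp code would typically reach this method via
 *
 *     EC_GROUP *grp = EC_GROUP_new(EC_GFp_nistp256_method());
 *
 * followed by EC_GROUP_set_curve() with the P-256 parameters;
 * EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1) may also select this
 * method, depending on how OpenSSL was configured.
 */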
1835
1836 /******************************************************************************/
1837 /*
1838 * FUNCTIONS TO MANAGE PRECOMPUTATION
1839 */
1840
1841 static NISTP256_PRE_COMP *nistp256_pre_comp_new(void)
1842 {
1843 NISTP256_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1844
1845 if (ret == NULL) {
1846 ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1847 return ret;
1848 }
1849
1850 ret->references = 1;
1851
1852 ret->lock = CRYPTO_THREAD_lock_new();
1853 if (ret->lock == NULL) {
1854 ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1855 OPENSSL_free(ret);
1856 return NULL;
1857 }
1858 return ret;
1859 }
1860
1861 NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *p)
1862 {
1863 int i;
1864 if (p != NULL)
1865 CRYPTO_UP_REF(&p->references, &i, p->lock);
1866 return p;
1867 }
1868
1869 void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *pre)
1870 {
1871 int i;
1872
1873 if (pre == NULL)
1874 return;
1875
1876 CRYPTO_DOWN_REF(&pre->references, &i, pre->lock);
1877     REF_PRINT_COUNT("EC_nistp256", pre);
1878 if (i > 0)
1879 return;
1880 REF_ASSERT_ISNT(i < 0);
1881
1882 CRYPTO_THREAD_lock_free(pre->lock);
1883 OPENSSL_free(pre);
1884 }
1885
1886 /******************************************************************************/
1887 /*
1888 * OPENSSL EC_METHOD FUNCTIONS
1889 */
1890
1891 int ec_GFp_nistp256_group_init(EC_GROUP *group)
1892 {
1893 int ret;
1894 ret = ec_GFp_simple_group_init(group);
1895 group->a_is_minus3 = 1;
1896 return ret;
1897 }
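/*-
 * Why a_is_minus3 holds: the P-256 coefficient a equals p - 3, i.e.
 * a == -3 (mod p), which enables the faster "a = -3" point-doubling formula.
 * A sketch verifying that directly from the byte tables at the top of this
 * file with generic BIGNUM arithmetic (the function name is hypothetical):
 */
#if 0
static int check_a_is_minus3(void)
{
    int ok = 0;
    BIGNUM *p = BN_new(), *a = BN_new(), *sum = BN_new();

    if (p == NULL || a == NULL || sum == NULL)
        goto end;
    if (BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), p) == NULL
        || BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), a) == NULL
        || !BN_set_word(sum, 3)
        || !BN_add(sum, a, sum))        /* sum = a + 3 */
        goto end;
    ok = (BN_cmp(sum, p) == 0);         /* a + 3 == p  <=>  a == -3 (mod p) */
 end:
    BN_free(p);
    BN_free(a);
    BN_free(sum);
    return ok;
}
#endif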
1898
1899 int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1900 const BIGNUM *a, const BIGNUM *b,
1901 BN_CTX *ctx)
1902 {
1903 int ret = 0;
1904 BIGNUM *curve_p, *curve_a, *curve_b;
1905 #ifndef FIPS_MODE
1906 BN_CTX *new_ctx = NULL;
1907
1908 if (ctx == NULL)
1909 ctx = new_ctx = BN_CTX_new();
1910 #endif
1911 if (ctx == NULL)
1912 return 0;
1913
1914 BN_CTX_start(ctx);
1915 curve_p = BN_CTX_get(ctx);
1916 curve_a = BN_CTX_get(ctx);
1917 curve_b = BN_CTX_get(ctx);
1918 if (curve_b == NULL)
1919 goto err;
1920 BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1921 BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1922 BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1923 if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1924 ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1925 EC_R_WRONG_CURVE_PARAMETERS);
1926 goto err;
1927 }
1928 group->field_mod_func = BN_nist_mod_256;
1929 ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1930 err:
1931 BN_CTX_end(ctx);
1932 #ifndef FIPS_MODE
1933 BN_CTX_free(new_ctx);
1934 #endif
1935 return ret;
1936 }
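/*-
 * Illustrative sketch (not part of the build): constructing a group that uses
 * this EC_METHOD explicitly.  The set_curve handler above only accepts the
 * exact P-256 parameters, so any other (p, a, b) fails with
 * EC_R_WRONG_CURVE_PARAMETERS.  The function name below is hypothetical.
 */
#if 0
static EC_GROUP *explicit_p256_group(void)
{
    EC_GROUP *group = NULL;
    BIGNUM *p, *a, *b;
    BN_CTX *ctx = BN_CTX_new();

    if (ctx == NULL)
        return NULL;
    p = BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), NULL);
    a = BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), NULL);
    b = BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), NULL);
    group = EC_GROUP_new(EC_GFp_nistp256_method());
    if (p == NULL || a == NULL || b == NULL || group == NULL
        || !EC_GROUP_set_curve(group, p, a, b, ctx)) {
        EC_GROUP_free(group);
        group = NULL;
    }
    BN_free(p);
    BN_free(a);
    BN_free(b);
    BN_CTX_free(ctx);
    return group;
}
#endif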
1937
1938 /*
1939 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1940 * (X/Z^2, Y/Z^3)
1941 */
1942 int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1943 const EC_POINT *point,
1944 BIGNUM *x, BIGNUM *y,
1945 BN_CTX *ctx)
1946 {
1947 felem z1, z2, x_in, y_in;
1948 smallfelem x_out, y_out;
1949 longfelem tmp;
1950
1951 if (EC_POINT_is_at_infinity(group, point)) {
1952 ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1953 EC_R_POINT_AT_INFINITY);
1954 return 0;
1955 }
1956 if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1957 (!BN_to_felem(z1, point->Z)))
1958 return 0;
1959 felem_inv(z2, z1);
1960 felem_square(tmp, z2);
1961 felem_reduce(z1, tmp);
1962 felem_mul(tmp, x_in, z1);
1963 felem_reduce(x_in, tmp);
1964 felem_contract(x_out, x_in);
1965 if (x != NULL) {
1966 if (!smallfelem_to_BN(x, x_out)) {
1967 ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1968 ERR_R_BN_LIB);
1969 return 0;
1970 }
1971 }
1972 felem_mul(tmp, z1, z2);
1973 felem_reduce(z1, tmp);
1974 felem_mul(tmp, y_in, z1);
1975 felem_reduce(y_in, tmp);
1976 felem_contract(y_out, y_in);
1977 if (y != NULL) {
1978 if (!smallfelem_to_BN(y, y_out)) {
1979 ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1980 ERR_R_BN_LIB);
1981 return 0;
1982 }
1983 }
1984 return 1;
1985 }
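/*-
 * Sketch only: the same (X, Y, Z) -> (X/Z^2, Y/Z^3) conversion expressed with
 * generic modular BIGNUM arithmetic, as a mental model / cross-check for the
 * specialised field routines used above.  All names are illustrative.
 */
#if 0
static int jacobian_to_affine_bn(BIGNUM *x_aff, BIGNUM *y_aff,
                                 const BIGNUM *X, const BIGNUM *Y,
                                 const BIGNUM *Z, const BIGNUM *p, BN_CTX *ctx)
{
    int ok = 0;
    BIGNUM *zinv, *zinv2, *zinv3;

    BN_CTX_start(ctx);
    zinv = BN_CTX_get(ctx);
    zinv2 = BN_CTX_get(ctx);
    zinv3 = BN_CTX_get(ctx);
    if (zinv3 == NULL)
        goto end;
    if (BN_mod_inverse(zinv, Z, p, ctx) == NULL      /* zinv  = Z^-1 mod p */
        || !BN_mod_sqr(zinv2, zinv, p, ctx)          /* zinv2 = Z^-2 */
        || !BN_mod_mul(zinv3, zinv2, zinv, p, ctx)   /* zinv3 = Z^-3 */
        || !BN_mod_mul(x_aff, X, zinv2, p, ctx)      /* x' = X / Z^2 */
        || !BN_mod_mul(y_aff, Y, zinv3, p, ctx))     /* y' = Y / Z^3 */
        goto end;
    ok = 1;
 end:
    BN_CTX_end(ctx);
    return ok;
}
#endif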
1986
1987 /* |points| below has |num| elements; |tmp_smallfelems| has |num+1| */
1988 static void make_points_affine(size_t num, smallfelem points[][3],
1989 smallfelem tmp_smallfelems[])
1990 {
1991 /*
1992 * Runs in constant time, unless an input is the point at infinity (which
1993 * normally shouldn't happen).
1994 */
1995 ec_GFp_nistp_points_make_affine_internal(num,
1996 points,
1997 sizeof(smallfelem),
1998 tmp_smallfelems,
1999 (void (*)(void *))smallfelem_one,
2000 smallfelem_is_zero_int,
2001 (void (*)(void *, const void *))
2002 smallfelem_assign,
2003 (void (*)(void *, const void *))
2004 smallfelem_square_contract,
2005 (void (*)
2006 (void *, const void *,
2007 const void *))
2008 smallfelem_mul_contract,
2009 (void (*)(void *, const void *))
2010 smallfelem_inv_contract,
2011 /* nothing to contract */
2012 (void (*)(void *, const void *))
2013 smallfelem_assign);
2014 }
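/*-
 * The point of the batch conversion above is that it needs only one modular
 * inversion for all the points.  Below is a sketch of the underlying idea
 * (Montgomery's simultaneous-inversion trick) in generic BIGNUM form; the
 * real helper works on field elements through the callbacks passed above
 * and, per the comment above, treats points at infinity as a special
 * (non-constant-time) case.  All names are illustrative.
 */
#if 0
/* Replace each vals[i] with vals[i]^-1 mod p: 1 inversion + O(n) mults */
static int batch_invert_bn(BIGNUM *vals[], size_t n, const BIGNUM *p,
                           BN_CTX *ctx)
{
    int ok = 0;
    size_t i;
    BIGNUM **prefix, *inv, *tmp;

    BN_CTX_start(ctx);
    inv = BN_CTX_get(ctx);
    tmp = BN_CTX_get(ctx);
    prefix = OPENSSL_malloc(sizeof(*prefix) * n);
    if (tmp == NULL || prefix == NULL || n == 0)
        goto end;
    for (i = 0; i < n; i++)
        if ((prefix[i] = BN_CTX_get(ctx)) == NULL)
            goto end;
    /* prefix[i] = vals[0] * ... * vals[i] (mod p) */
    if (!BN_copy(prefix[0], vals[0]))
        goto end;
    for (i = 1; i < n; i++)
        if (!BN_mod_mul(prefix[i], prefix[i - 1], vals[i], p, ctx))
            goto end;
    /* a single inversion of the full product */
    if (BN_mod_inverse(inv, prefix[n - 1], p, ctx) == NULL)
        goto end;
    /* walk back, peeling off one factor at a time */
    for (i = n - 1; i > 0; i--) {
        if (!BN_mod_mul(tmp, inv, prefix[i - 1], p, ctx)  /* = vals[i]^-1 */
            || !BN_mod_mul(inv, inv, vals[i], p, ctx)     /* drop vals[i] */
            || !BN_copy(vals[i], tmp))
            goto end;
    }
    ok = (BN_copy(vals[0], inv) != NULL);
 end:
    OPENSSL_free(prefix);
    BN_CTX_end(ctx);
    return ok;
}
#endif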
2015
2016 /*
2017 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
2018 * values. The result is stored in r (r can equal one of the inputs).
2019 */
2020 int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
2021 const BIGNUM *scalar, size_t num,
2022 const EC_POINT *points[],
2023 const BIGNUM *scalars[], BN_CTX *ctx)
2024 {
2025 int ret = 0;
2026 int j;
2027 int mixed = 0;
2028 BIGNUM *x, *y, *z, *tmp_scalar;
2029 felem_bytearray g_secret;
2030 felem_bytearray *secrets = NULL;
2031 smallfelem (*pre_comp)[17][3] = NULL;
2032 smallfelem *tmp_smallfelems = NULL;
2033 felem_bytearray tmp;
2034 unsigned i, num_bytes;
2035 int have_pre_comp = 0;
2036 size_t num_points = num;
2037 smallfelem x_in, y_in, z_in;
2038 felem x_out, y_out, z_out;
2039 NISTP256_PRE_COMP *pre = NULL;
2040 const smallfelem(*g_pre_comp)[16][3] = NULL;
2041 EC_POINT *generator = NULL;
2042 const EC_POINT *p = NULL;
2043 const BIGNUM *p_scalar = NULL;
2044
2045 BN_CTX_start(ctx);
2046 x = BN_CTX_get(ctx);
2047 y = BN_CTX_get(ctx);
2048 z = BN_CTX_get(ctx);
2049 tmp_scalar = BN_CTX_get(ctx);
2050 if (tmp_scalar == NULL)
2051 goto err;
2052
2053 if (scalar != NULL) {
2054 pre = group->pre_comp.nistp256;
2055 if (pre)
2056 /* we have precomputation, try to use it */
2057 g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2058 else
2059 /* try to use the standard precomputation */
2060 g_pre_comp = &gmul[0];
2061 generator = EC_POINT_new(group);
2062 if (generator == NULL)
2063 goto err;
2064 /* get the generator from precomputation */
2065 if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2066 !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2067 !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2068 ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2069 goto err;
2070 }
2071 if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2072 generator, x, y, z,
2073 ctx))
2074 goto err;
2075 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2076 /* precomputation matches generator */
2077 have_pre_comp = 1;
2078 else
2079 /*
2080 * we don't have valid precomputation: treat the generator as a
2081 * random point
2082 */
2083 num_points++;
2084 }
2085 if (num_points > 0) {
2086 if (num_points >= 3) {
2087 /*
2088 * unless we precompute multiples for just one or two points,
2089 * converting those into affine form is time well spent
2090 */
2091 mixed = 1;
2092 }
2093 secrets = OPENSSL_malloc(sizeof(*secrets) * num_points);
2094 pre_comp = OPENSSL_malloc(sizeof(*pre_comp) * num_points);
2095 if (mixed)
2096 tmp_smallfelems =
2097 OPENSSL_malloc(sizeof(*tmp_smallfelems) * (num_points * 17 + 1));
2098 if ((secrets == NULL) || (pre_comp == NULL)
2099 || (mixed && (tmp_smallfelems == NULL))) {
2100 ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2101 goto err;
2102 }
2103
2104 /*
2105 * we treat NULL scalars as 0, and NULL points as points at infinity,
2106 * i.e., they contribute nothing to the linear combination
2107 */
2108 memset(secrets, 0, sizeof(*secrets) * num_points);
2109 memset(pre_comp, 0, sizeof(*pre_comp) * num_points);
2110 for (i = 0; i < num_points; ++i) {
2111 if (i == num)
2112 /*
2113 * we didn't have a valid precomputation, so we pick the
2114 * generator
2115 */
2116 {
2117 p = EC_GROUP_get0_generator(group);
2118 p_scalar = scalar;
2119 } else
2120 /* the i^th point */
2121 {
2122 p = points[i];
2123 p_scalar = scalars[i];
2124 }
2125 if ((p_scalar != NULL) && (p != NULL)) {
2126 /* reduce scalar to 0 <= scalar < 2^256 */
2127 if ((BN_num_bits(p_scalar) > 256)
2128 || (BN_is_negative(p_scalar))) {
2129 /*
2130 * this is an unusual input, and we don't guarantee
2131 * constant-timeness
2132 */
2133 if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
2134 ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2135 goto err;
2136 }
2137 num_bytes = BN_bn2bin(tmp_scalar, tmp);
2138 } else
2139 num_bytes = BN_bn2bin(p_scalar, tmp);
2140 flip_endian(secrets[i], tmp, num_bytes);
2141 /* precompute multiples */
2142 if ((!BN_to_felem(x_out, p->X)) ||
2143 (!BN_to_felem(y_out, p->Y)) ||
2144 (!BN_to_felem(z_out, p->Z)))
2145 goto err;
2146 felem_shrink(pre_comp[i][1][0], x_out);
2147 felem_shrink(pre_comp[i][1][1], y_out);
2148 felem_shrink(pre_comp[i][1][2], z_out);
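            /*
             * Invariant built by the loop below: pre_comp[i][j] holds
             * j * points[i] in Jacobian coordinates, using
             * j*P = P + (j-1)*P for odd j and j*P = 2*((j/2)*P) for even j;
             * entry 0 stays the all-zero encoding of the point at infinity
             * from the memset above.
             */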
2149 for (j = 2; j <= 16; ++j) {
2150 if (j & 1) {
2151 point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2152 pre_comp[i][j][2], pre_comp[i][1][0],
2153 pre_comp[i][1][1], pre_comp[i][1][2],
2154 pre_comp[i][j - 1][0],
2155 pre_comp[i][j - 1][1],
2156 pre_comp[i][j - 1][2]);
2157 } else {
2158 point_double_small(pre_comp[i][j][0],
2159 pre_comp[i][j][1],
2160 pre_comp[i][j][2],
2161 pre_comp[i][j / 2][0],
2162 pre_comp[i][j / 2][1],
2163 pre_comp[i][j / 2][2]);
2164 }
2165 }
2166 }
2167 }
2168 if (mixed)
2169 make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2170 }
2171
2172 /* the scalar for the generator */
2173 if ((scalar != NULL) && (have_pre_comp)) {
2174 memset(g_secret, 0, sizeof(g_secret));
2175 /* reduce scalar to 0 <= scalar < 2^256 */
2176 if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2177 /*
2178 * this is an unusual input, and we don't guarantee
2179 * constant-timeness
2180 */
2181 if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2182 ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2183 goto err;
2184 }
2185 num_bytes = BN_bn2bin(tmp_scalar, tmp);
2186 } else
2187 num_bytes = BN_bn2bin(scalar, tmp);
2188 flip_endian(g_secret, tmp, num_bytes);
2189 /* do the multiplication with generator precomputation */
2190 batch_mul(x_out, y_out, z_out,
2191 (const felem_bytearray(*))secrets, num_points,
2192 g_secret,
2193 mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2194 } else
2195 /* do the multiplication without generator precomputation */
2196 batch_mul(x_out, y_out, z_out,
2197 (const felem_bytearray(*))secrets, num_points,
2198 NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2199 /* reduce the output to its unique minimal representation */
2200 felem_contract(x_in, x_out);
2201 felem_contract(y_in, y_out);
2202 felem_contract(z_in, z_out);
2203 if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2204 (!smallfelem_to_BN(z, z_in))) {
2205 ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2206 goto err;
2207 }
2208 ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2209
2210 err:
2211 BN_CTX_end(ctx);
2212 EC_POINT_free(generator);
2213 OPENSSL_free(secrets);
2214 OPENSSL_free(pre_comp);
2215 OPENSSL_free(tmp_smallfelems);
2216 return ret;
2217 }
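/*-
 * Usage sketch (illustrative, not part of this file): from an application's
 * point of view the linear combination computed above is reached through the
 * public EC_POINT_mul()/EC_POINTs_mul() API, e.g. r = g_scalar*G + p_scalar*P.
 * The function name below is hypothetical.
 */
#if 0
static EC_POINT *linear_combination_example(const EC_GROUP *group,
                                            const BIGNUM *g_scalar,
                                            const EC_POINT *P,
                                            const BIGNUM *p_scalar)
{
    BN_CTX *ctx = BN_CTX_new();
    EC_POINT *r = EC_POINT_new(group);
    const EC_POINT *points[1];
    const BIGNUM *scalars[1];

    points[0] = P;
    scalars[0] = p_scalar;
    if (ctx == NULL || r == NULL
        || !EC_POINTs_mul(group, r, g_scalar, 1, points, scalars, ctx)) {
        EC_POINT_free(r);
        r = NULL;
    }
    BN_CTX_free(ctx);
    return r;
}
#endif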
2218
2219 int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2220 {
2221 int ret = 0;
2222 NISTP256_PRE_COMP *pre = NULL;
2223 int i, j;
2224 BIGNUM *x, *y;
2225 EC_POINT *generator = NULL;
2226 smallfelem tmp_smallfelems[32];
2227 felem x_tmp, y_tmp, z_tmp;
2228 #ifndef FIPS_MODE
2229 BN_CTX *new_ctx = NULL;
2230 #endif
2231
2232 /* throw away old precomputation */
2233 EC_pre_comp_free(group);
2234
2235 #ifndef FIPS_MODE
2236 if (ctx == NULL)
2237 ctx = new_ctx = BN_CTX_new();
2238 #endif
2239 if (ctx == NULL)
2240 return 0;
2241
2242 BN_CTX_start(ctx);
2243 x = BN_CTX_get(ctx);
2244 y = BN_CTX_get(ctx);
2245 if (y == NULL)
2246 goto err;
2247 /* get the generator */
2248 if (group->generator == NULL)
2249 goto err;
2250 generator = EC_POINT_new(group);
2251 if (generator == NULL)
2252 goto err;
2253 BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2254 BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2255 if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2256 goto err;
2257 if ((pre = nistp256_pre_comp_new()) == NULL)
2258 goto err;
2259 /*
2260 * if the generator is the standard one, use built-in precomputation
2261 */
2262 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2263 memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2264 goto done;
2265 }
2266 if ((!BN_to_felem(x_tmp, group->generator->X)) ||
2267 (!BN_to_felem(y_tmp, group->generator->Y)) ||
2268 (!BN_to_felem(z_tmp, group->generator->Z)))
2269 goto err;
2270 felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2271 felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2272 felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2273 /*
2274 * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2275 * 2^160*G, 2^224*G for the second one
2276 */
2277 for (i = 1; i <= 8; i <<= 1) {
2278 point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2279 pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2280 pre->g_pre_comp[0][i][1],
2281 pre->g_pre_comp[0][i][2]);
2282 for (j = 0; j < 31; ++j) {
2283 point_double_small(pre->g_pre_comp[1][i][0],
2284 pre->g_pre_comp[1][i][1],
2285 pre->g_pre_comp[1][i][2],
2286 pre->g_pre_comp[1][i][0],
2287 pre->g_pre_comp[1][i][1],
2288 pre->g_pre_comp[1][i][2]);
2289 }
2290 if (i == 8)
2291 break;
2292 point_double_small(pre->g_pre_comp[0][2 * i][0],
2293 pre->g_pre_comp[0][2 * i][1],
2294 pre->g_pre_comp[0][2 * i][2],
2295 pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2296 pre->g_pre_comp[1][i][2]);
2297 for (j = 0; j < 31; ++j) {
2298 point_double_small(pre->g_pre_comp[0][2 * i][0],
2299 pre->g_pre_comp[0][2 * i][1],
2300 pre->g_pre_comp[0][2 * i][2],
2301 pre->g_pre_comp[0][2 * i][0],
2302 pre->g_pre_comp[0][2 * i][1],
2303 pre->g_pre_comp[0][2 * i][2]);
2304 }
2305 }
2306 for (i = 0; i < 2; i++) {
2307 /* g_pre_comp[i][0] is the point at infinity */
2308 memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2309 /* the remaining multiples */
2310 /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2311 point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2312 pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2313 pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2314 pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2315 pre->g_pre_comp[i][2][2]);
2316 /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2317 point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2318 pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2319 pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2320 pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2321 pre->g_pre_comp[i][2][2]);
2322 /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2323 point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2324 pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2325 pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2326 pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2327 pre->g_pre_comp[i][4][2]);
2328 /*
2329 * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2330 */
2331 point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2332 pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2333 pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2334 pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2335 pre->g_pre_comp[i][2][2]);
2336 for (j = 1; j < 8; ++j) {
2337 /* odd multiples: add G resp. 2^32*G */
2338 point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2339 pre->g_pre_comp[i][2 * j + 1][1],
2340 pre->g_pre_comp[i][2 * j + 1][2],
2341 pre->g_pre_comp[i][2 * j][0],
2342 pre->g_pre_comp[i][2 * j][1],
2343 pre->g_pre_comp[i][2 * j][2],
2344 pre->g_pre_comp[i][1][0],
2345 pre->g_pre_comp[i][1][1],
2346 pre->g_pre_comp[i][1][2]);
2347 }
2348 }
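    /*
     * Resulting table layout (derived from the doublings and additions
     * above): for a 4-bit index written in binary as b3 b2 b1 b0,
     * g_pre_comp[0][b3 b2 b1 b0] = (b0 + b1*2^64 + b2*2^128 + b3*2^192) * G,
     * and g_pre_comp[1][j] = 2^32 * g_pre_comp[0][j] -- the comb layout that
     * the generator path of batch_mul() consumes.
     */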
2349 make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2350
2351 done:
2352 SETPRECOMP(group, nistp256, pre);
2353 pre = NULL;
2354 ret = 1;
2355
2356 err:
2357 BN_CTX_end(ctx);
2358 EC_POINT_free(generator);
2359 #ifndef FIPS_MODE
2360 BN_CTX_free(new_ctx);
2361 #endif
2362 EC_nistp256_pre_comp_free(pre);
2363 return ret;
2364 }
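/*-
 * Hypothetical spot-check (illustrative, not part of the build): verify one
 * entry of a freshly built table, e.g. that g_pre_comp[0][2] equals 2^64 * G,
 * using routines already present in this file plus the public point API.
 * The function name is hypothetical.
 */
#if 0
static int check_one_comb_entry(const EC_GROUP *group,
                                const NISTP256_PRE_COMP *pre, BN_CTX *ctx)
{
    int ok = 0;
    BIGNUM *k = BN_new(), *x = BN_new(), *y = BN_new(), *z = BN_new();
    EC_POINT *expect = EC_POINT_new(group), *got = EC_POINT_new(group);

    if (k == NULL || x == NULL || y == NULL || z == NULL
        || expect == NULL || got == NULL)
        goto end;
    /* expect = 2^64 * G, computed through the generic path */
    if (!BN_lshift(k, BN_value_one(), 64)
        || !EC_POINT_mul(group, expect, k, NULL, NULL, ctx))
        goto end;
    /* got = the precomputed entry, converted back into an EC_POINT */
    if (!smallfelem_to_BN(x, pre->g_pre_comp[0][2][0])
        || !smallfelem_to_BN(y, pre->g_pre_comp[0][2][1])
        || !smallfelem_to_BN(z, pre->g_pre_comp[0][2][2])
        || !EC_POINT_set_Jprojective_coordinates_GFp(group, got, x, y, z, ctx))
        goto end;
    ok = (EC_POINT_cmp(group, expect, got, ctx) == 0);
 end:
    EC_POINT_free(expect);
    EC_POINT_free(got);
    BN_free(k);
    BN_free(x);
    BN_free(y);
    BN_free(z);
    return ok;
}
#endif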
2365
2366 int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2367 {
2368 return HAVEPRECOMP(group, nistp256);
2369 }
2370 #endif