crypto/ec/ecp_nistp521.c

   1 /* crypto/ec/ecp_nistp521.c */
   2 /*
   3  * Written by Adam Langley (Google) for the OpenSSL project
   4  */
   5 /* Copyright 2011 Google Inc.
   6  *
   7  * Licensed under the Apache License, Version 2.0 (the "License");
   8  *
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  *  Unless required by applicable law or agreed to in writing, software
  15  *  distributed under the License is distributed on an "AS IS" BASIS,
  16  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  *  See the License for the specific language governing permissions and
  18  *  limitations under the License.
  19  */
  20
  21 /*
  22  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  23  *
  24  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  25  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  26  * work which got its smarts from Daniel J. Bernstein's work on the same.
  27  */
  28
  29 #include <openssl/opensslconf.h>
  30 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
  31
  32 # ifndef OPENSSL_SYS_VMS
  33 #  include <stdint.h>
  34 # else
  35 #  include <inttypes.h>
  36 # endif
  37
  38 # include <string.h>
  39 # include <openssl/err.h>
  40 # include "ec_lcl.h"
  41
  42 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  43   /* even with gcc, the typedef won't work for 32-bit platforms */
     /*
      * uint128_t is the 128-bit unsigned type used for the wide limbs of a
      * largefelem. The version test above only checks the compiler release;
      * it does not detect a 32-bit target, where the typedef below still fails.
      */
  44 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  45                                  * platforms */
  46 # else
  47 #  error "Need GCC 3.1 or later to define type uint128_t"
  48 # endif
  49
     /* Short aliases for the fixed-width integer types used in this file. */
  50 typedef uint8_t u8;
  51 typedef uint64_t u64;
  52 typedef int64_t s64;
  53
  54 /*
  55  * The underlying field. P521 operates over GF(2^521-1). We can serialise an
  56  * element of this field into 66 bytes where the most significant byte
  57  * contains only a single bit. We call this an felem_bytearray.
      * (521 bits are 65 whole bytes plus one extra bit, hence 66 bytes.)
  58  */
  59
  60 typedef u8 felem_bytearray[66];
  61
  62 /*
  63  * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
  64  * These values are big-endian.
      *
      * The table holds five 66-byte entries, in this order: the field prime p,
      * the curve coefficients a and b, and the affine coordinates x and y of
      * the base point. Because the entries are big-endian they must be
      * byte-reversed before being handed to the little-endian felem loaders
      * in this file.
  65  */
  66 static const felem_bytearray nistp521_curve_params[5] = {
  67     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p = 2^521 - 1 */
  68      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  69      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  70      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  72      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  73      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  74      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  75      0xff, 0xff},
  76     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3, stored as p - 3 */
  77      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  78      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  79      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  80      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  81      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  82      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  83      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  84      0xff, 0xfc},
  85     {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
  86      0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
  87      0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
  88      0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
  89      0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
  90      0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
  91      0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
  92      0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
  93      0x3f, 0x00},
  94     {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x (base point) */
  95      0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
  96      0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
  97      0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
  98      0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
  99      0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
 100      0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
 101      0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
 102      0xbd, 0x66},
 103     {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y (base point) */
 104      0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
 105      0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
 106      0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
 107      0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
 108      0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
 109      0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
 110      0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
 111      0x66, 0x50}
 112 };
 113
 114 /*-
 115  * The representation of field elements.
 116  * ------------------------------------
 117  *
 118  * We represent field elements with nine values. These values are either 64 or
 119  * 128 bits and the field element represented is:
 120  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 121  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 122  * 58 bits apart, but are greater than 58 bits in length, the most significant
 123  * bits of each limb overlap with the least significant bits of the next.
 124  *
      * Because of that overlap a given field element has many representations;
      * the spare high bits of each limb let additions and subtractions be
      * performed limb by limb without propagating carries every time.
      *
 125  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 126  * 'largefelem' */
 127
 128 # define NLIMBS 9 /* number of limbs in an felem or largefelem */
 129
 130 typedef uint64_t limb;
 131 typedef limb felem[NLIMBS];
 132 typedef uint128_t largefelem[NLIMBS];
 133
 134 static const limb bottom57bits = 0x1ffffffffffffff; /* 2^57 - 1: mask for the top limb */
 135 static const limb bottom58bits = 0x3ffffffffffffff; /* 2^58 - 1: mask for every other limb */
 136
 137 /*
 138  * bin66_to_felem takes a little-endian byte array and converts it into felem
 139  * form. This assumes that the CPU is little-endian.
 140  */
 141 static void bin66_to_felem(felem out, const u8 in[66])
 142 {
 143     out[0] = (*((limb *) & in[0])) & bottom58bits;
 144     out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
 145     out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
 146     out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
 147     out[4] = (*((limb *) & in[29])) & bottom58bits;
 148     out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
 149     out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
 150     out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
 151     out[8] = (*((limb *) & in[58])) & bottom57bits;
 152 }
 153
 154 /*
 155  * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
 156  * array. This assumes that the CPU is little-endian.
 157  */
 158 static void felem_to_bin66(u8 out[66], const felem in)
 159 {
 160     memset(out, 0, 66);
 161     (*((limb *) & out[0])) = in[0];
 162     (*((limb *) & out[7])) |= in[1] << 2;
 163     (*((limb *) & out[14])) |= in[2] << 4;
 164     (*((limb *) & out[21])) |= in[3] << 6;
 165     (*((limb *) & out[29])) = in[4];
 166     (*((limb *) & out[36])) |= in[5] << 2;
 167     (*((limb *) & out[43])) |= in[6] << 4;
 168     (*((limb *) & out[50])) |= in[7] << 6;
 169     (*((limb *) & out[58])) = in[8];
 170 }
 171
 172 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 173 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 174 {
 175     unsigned i;
 176     for (i = 0; i < len; ++i)
 177         out[i] = in[len - 1 - i];
 178 }
 179
 180 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 181 static int BN_to_felem(felem out, const BIGNUM *bn)
 182 {
 183     felem_bytearray b_in;
 184     felem_bytearray b_out;
 185     unsigned num_bytes;
 186
 187     /* BN_bn2bin eats leading zeroes */
 188     memset(b_out, 0, sizeof b_out);
 189     num_bytes = BN_num_bytes(bn);
 190     if (num_bytes > sizeof b_out) {
 191         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 192         return 0;
 193     }
 194     if (BN_is_negative(bn)) {
 195         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 196         return 0;
 197     }
 198     num_bytes = BN_bn2bin(bn, b_in);
 199     flip_endian(b_out, b_in, num_bytes);
 200     bin66_to_felem(out, b_out);
 201     return 1;
 202 }
 203
 204 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 205 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 206 {
 207     felem_bytearray b_in, b_out;
 208     felem_to_bin66(b_in, in);
 209     flip_endian(b_out, b_in, sizeof b_out);
 210     return BN_bin2bn(b_out, sizeof b_out, out);
 211 }
 212
 213 /*-
 214  * Field operations
 215  * ----------------
 216  */
 217
 218 static void felem_one(felem out)
 219 {
 220     out[0] = 1;
 221     out[1] = 0;
 222     out[2] = 0;
 223     out[3] = 0;
 224     out[4] = 0;
 225     out[5] = 0;
 226     out[6] = 0;
 227     out[7] = 0;
 228     out[8] = 0;
 229 }
 230
 231 static void felem_assign(felem out, const felem in)
 232 {
 233     out[0] = in[0];
 234     out[1] = in[1];
 235     out[2] = in[2];
 236     out[3] = in[3];
 237     out[4] = in[4];
 238     out[5] = in[5];
 239     out[6] = in[6];
 240     out[7] = in[7];
 241     out[8] = in[8];
 242 }
 243
 244 /* felem_sum64 sets out = out + in. */
 245 static void felem_sum64(felem out, const felem in)
 246 {
 247     out[0] += in[0];
 248     out[1] += in[1];
 249     out[2] += in[2];
 250     out[3] += in[3];
 251     out[4] += in[4];
 252     out[5] += in[5];
 253     out[6] += in[6];
 254     out[7] += in[7];
 255     out[8] += in[8];
 256 }
 257
 258 /* felem_scalar sets out = in * scalar */
 259 static void felem_scalar(felem out, const felem in, limb scalar)
 260 {
 261     out[0] = in[0] * scalar;
 262     out[1] = in[1] * scalar;
 263     out[2] = in[2] * scalar;
 264     out[3] = in[3] * scalar;
 265     out[4] = in[4] * scalar;
 266     out[5] = in[5] * scalar;
 267     out[6] = in[6] * scalar;
 268     out[7] = in[7] * scalar;
 269     out[8] = in[8] * scalar;
 270 }
 271
 272 /* felem_scalar64 sets out = out * scalar */
 273 static void felem_scalar64(felem out, limb scalar)
 274 {
 275     out[0] *= scalar;
 276     out[1] *= scalar;
 277     out[2] *= scalar;
 278     out[3] *= scalar;
 279     out[4] *= scalar;
 280     out[5] *= scalar;
 281     out[6] *= scalar;
 282     out[7] *= scalar;
 283     out[8] *= scalar;
 284 }
 285
 286 /* felem_scalar128 sets out = out * scalar */
 287 static void felem_scalar128(largefelem out, limb scalar)
 288 {
 289     out[0] *= scalar;
 290     out[1] *= scalar;
 291     out[2] *= scalar;
 292     out[3] *= scalar;
 293     out[4] *= scalar;
 294     out[5] *= scalar;
 295     out[6] *= scalar;
 296     out[7] *= scalar;
 297     out[8] *= scalar;
 298 }
 299
 300 /*-
 301  * felem_neg sets |out| to |-in|
 302  * On entry:
 303  *   in[i] < 2^59 + 2^14
 304  * On exit:
 305  *   out[i] < 2^62
 306  */
 307 static void felem_neg(felem out, const felem in)
 308 {
 309     /* In order to prevent underflow, we subtract from 0 mod p. */
 310     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 311     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 312
 313     out[0] = two62m3 - in[0];
 314     out[1] = two62m2 - in[1];
 315     out[2] = two62m2 - in[2];
 316     out[3] = two62m2 - in[3];
 317     out[4] = two62m2 - in[4];
 318     out[5] = two62m2 - in[5];
 319     out[6] = two62m2 - in[6];
 320     out[7] = two62m2 - in[7];
 321     out[8] = two62m2 - in[8];
 322 }
 323
 324 /*-
 325  * felem_diff64 subtracts |in| from |out|
 326  * On entry:
 327  *   in[i] < 2^59 + 2^14
 328  * On exit:
 329  *   out[i] < out[i] + 2^62
 330  */
 331 static void felem_diff64(felem out, const felem in)
 332 {
 333     /*
 334      * In order to prevent underflow, we add 0 mod p before subtracting.
 335      */
 336     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 337     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 338
 339     out[0] += two62m3 - in[0];
 340     out[1] += two62m2 - in[1];
 341     out[2] += two62m2 - in[2];
 342     out[3] += two62m2 - in[3];
 343     out[4] += two62m2 - in[4];
 344     out[5] += two62m2 - in[5];
 345     out[6] += two62m2 - in[6];
 346     out[7] += two62m2 - in[7];
 347     out[8] += two62m2 - in[8];
 348 }
 349
 350 /*-
 351  * felem_diff_128_64 subtracts |in| from |out|
 352  * On entry:
 353  *   in[i] < 2^62 + 2^17
 354  * On exit:
 355  *   out[i] < out[i] + 2^63
 356  */
 357 static void felem_diff_128_64(largefelem out, const felem in)
 358 {
 359     /*
 360      * In order to prevent underflow, we add 0 mod p before subtracting.
 361      */
 362     static const limb two63m6 = (((limb) 1) << 62) - (((limb) 1) << 5);
 363     static const limb two63m5 = (((limb) 1) << 62) - (((limb) 1) << 4);
 364
 365     out[0] += two63m6 - in[0];
 366     out[1] += two63m5 - in[1];
 367     out[2] += two63m5 - in[2];
 368     out[3] += two63m5 - in[3];
 369     out[4] += two63m5 - in[4];
 370     out[5] += two63m5 - in[5];
 371     out[6] += two63m5 - in[6];
 372     out[7] += two63m5 - in[7];
 373     out[8] += two63m5 - in[8];
 374 }
 375
 376 /*-
 377  * felem_diff_128_64 subtracts |in| from |out|
 378  * On entry:
 379  *   in[i] < 2^126
 380  * On exit:
 381  *   out[i] < out[i] + 2^127 - 2^69
 382  */
 383 static void felem_diff128(largefelem out, const largefelem in)
 384 {
 385     /*
 386      * In order to prevent underflow, we add 0 mod p before subtracting.
 387      */
 388     static const uint128_t two127m70 =
 389         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
 390     static const uint128_t two127m69 =
 391         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
 392
 393     out[0] += (two127m70 - in[0]);
 394     out[1] += (two127m69 - in[1]);
 395     out[2] += (two127m69 - in[2]);
 396     out[3] += (two127m69 - in[3]);
 397     out[4] += (two127m69 - in[4]);
 398     out[5] += (two127m69 - in[5]);
 399     out[6] += (two127m69 - in[6]);
 400     out[7] += (two127m69 - in[7]);
 401     out[8] += (two127m69 - in[8]);
 402 }
 403
 404 /*-
 405  * felem_square sets |out| = |in|^2
 406  * On entry:
 407  *   in[i] < 2^62
 408  * On exit:
 409  *   out[i] < 17 * max(in[i]) * max(in[i])
 410  */
 411 static void felem_square(largefelem out, const felem in)
 412 {
 413     felem inx2, inx4;
 414     felem_scalar(inx2, in, 2);
 415     felem_scalar(inx4, in, 4);
 416
 417     /*-
 418      * We have many cases were we want to do
 419      *   in[x] * in[y] +
 420      *   in[y] * in[x]
 421      * This is obviously just
 422      *   2 * in[x] * in[y]
 423      * However, rather than do the doubling on the 128 bit result, we
 424      * double one of the inputs to the multiplication by reading from
 425      * |inx2|
 426      */
 427
 428     out[0] = ((uint128_t) in[0]) * in[0];
 429     out[1] = ((uint128_t) in[0]) * inx2[1];
 430     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
 431     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
 432     out[4] = ((uint128_t) in[0]) * inx2[4] +
 433              ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
 434     out[5] = ((uint128_t) in[0]) * inx2[5] +
 435              ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
 436     out[6] = ((uint128_t) in[0]) * inx2[6] +
 437              ((uint128_t) in[1]) * inx2[5] +
 438              ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
 439     out[7] = ((uint128_t) in[0]) * inx2[7] +
 440              ((uint128_t) in[1]) * inx2[6] +
 441              ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
 442     out[8] = ((uint128_t) in[0]) * inx2[8] +
 443              ((uint128_t) in[1]) * inx2[7] +
 444              ((uint128_t) in[2]) * inx2[6] +
 445              ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
 446
 447     /*
 448      * The remaining limbs fall above 2^521, with the first falling at 2^522.
 449      * They correspond to locations one bit up from the limbs produced above
 450      * so we would have to multiply by two to align them. Again, rather than
 451      * operate on the 128-bit result, we double one of the inputs to the
 452      * multiplication. If we want to double for both this reason, and the
 453      * reason above, then we end up multiplying by four.
 454      */
 455
 456     /* 9 */
 457     out[0] += ((uint128_t) in[1]) * inx4[8] +
 458               ((uint128_t) in[2]) * inx4[7] +
 459               ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
 460
 461     /* 10 */
 462     out[1] += ((uint128_t) in[2]) * inx4[8] +
 463               ((uint128_t) in[3]) * inx4[7] +
 464               ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
 465
 466     /* 11 */
 467     out[2] += ((uint128_t) in[3]) * inx4[8] +
 468               ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
 469
 470     /* 12 */
 471     out[3] += ((uint128_t) in[4]) * inx4[8] +
 472               ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
 473
 474     /* 13 */
 475     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
 476
 477     /* 14 */
 478     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
 479
 480     /* 15 */
 481     out[6] += ((uint128_t) in[7]) * inx4[8];
 482
 483     /* 16 */
 484     out[7] += ((uint128_t) in[8]) * inx2[8];
 485 }
 486
 487 /*-
 488  * felem_mul sets |out| = |in1| * |in2|
 489  * On entry:
 490  *   in1[i] < 2^64
 491  *   in2[i] < 2^63
 492  * On exit:
 493  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 494  */
 495 static void felem_mul(largefelem out, const felem in1, const felem in2)
 496 {
 497     felem in2x2;
 498     felem_scalar(in2x2, in2, 2);
 499
 500     out[0] = ((uint128_t) in1[0]) * in2[0];
 501
 502     out[1] = ((uint128_t) in1[0]) * in2[1] +
 503              ((uint128_t) in1[1]) * in2[0];
 504
 505     out[2] = ((uint128_t) in1[0]) * in2[2] +
 506              ((uint128_t) in1[1]) * in2[1] +
 507              ((uint128_t) in1[2]) * in2[0];
 508
 509     out[3] = ((uint128_t) in1[0]) * in2[3] +
 510              ((uint128_t) in1[1]) * in2[2] +
 511              ((uint128_t) in1[2]) * in2[1] +
 512              ((uint128_t) in1[3]) * in2[0];
 513
 514     out[4] = ((uint128_t) in1[0]) * in2[4] +
 515              ((uint128_t) in1[1]) * in2[3] +
 516              ((uint128_t) in1[2]) * in2[2] +
 517              ((uint128_t) in1[3]) * in2[1] +
 518              ((uint128_t) in1[4]) * in2[0];
 519
 520     out[5] = ((uint128_t) in1[0]) * in2[5] +
 521              ((uint128_t) in1[1]) * in2[4] +
 522              ((uint128_t) in1[2]) * in2[3] +
 523              ((uint128_t) in1[3]) * in2[2] +
 524              ((uint128_t) in1[4]) * in2[1] +
 525              ((uint128_t) in1[5]) * in2[0];
 526
 527     out[6] = ((uint128_t) in1[0]) * in2[6] +
 528              ((uint128_t) in1[1]) * in2[5] +
 529              ((uint128_t) in1[2]) * in2[4] +
 530              ((uint128_t) in1[3]) * in2[3] +
 531              ((uint128_t) in1[4]) * in2[2] +
 532              ((uint128_t) in1[5]) * in2[1] +
 533              ((uint128_t) in1[6]) * in2[0];
 534
 535     out[7] = ((uint128_t) in1[0]) * in2[7] +
 536              ((uint128_t) in1[1]) * in2[6] +
 537              ((uint128_t) in1[2]) * in2[5] +
 538              ((uint128_t) in1[3]) * in2[4] +
 539              ((uint128_t) in1[4]) * in2[3] +
 540              ((uint128_t) in1[5]) * in2[2] +
 541              ((uint128_t) in1[6]) * in2[1] +
 542              ((uint128_t) in1[7]) * in2[0];
 543
 544     out[8] = ((uint128_t) in1[0]) * in2[8] +
 545              ((uint128_t) in1[1]) * in2[7] +
 546              ((uint128_t) in1[2]) * in2[6] +
 547              ((uint128_t) in1[3]) * in2[5] +
 548              ((uint128_t) in1[4]) * in2[4] +
 549              ((uint128_t) in1[5]) * in2[3] +
 550              ((uint128_t) in1[6]) * in2[2] +
 551              ((uint128_t) in1[7]) * in2[1] +
 552              ((uint128_t) in1[8]) * in2[0];
 553
 554     /* See comment in felem_square about the use of in2x2 here */
 555
 556     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 557               ((uint128_t) in1[2]) * in2x2[7] +
 558               ((uint128_t) in1[3]) * in2x2[6] +
 559               ((uint128_t) in1[4]) * in2x2[5] +
 560               ((uint128_t) in1[5]) * in2x2[4] +
 561               ((uint128_t) in1[6]) * in2x2[3] +
 562               ((uint128_t) in1[7]) * in2x2[2] +
 563               ((uint128_t) in1[8]) * in2x2[1];
 564
 565     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 566               ((uint128_t) in1[3]) * in2x2[7] +
 567               ((uint128_t) in1[4]) * in2x2[6] +
 568               ((uint128_t) in1[5]) * in2x2[5] +
 569               ((uint128_t) in1[6]) * in2x2[4] +
 570               ((uint128_t) in1[7]) * in2x2[3] +
 571               ((uint128_t) in1[8]) * in2x2[2];
 572
 573     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 574               ((uint128_t) in1[4]) * in2x2[7] +
 575               ((uint128_t) in1[5]) * in2x2[6] +
 576               ((uint128_t) in1[6]) * in2x2[5] +
 577               ((uint128_t) in1[7]) * in2x2[4] +
 578               ((uint128_t) in1[8]) * in2x2[3];
 579
 580     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 581               ((uint128_t) in1[5]) * in2x2[7] +
 582               ((uint128_t) in1[6]) * in2x2[6] +
 583               ((uint128_t) in1[7]) * in2x2[5] +
 584               ((uint128_t) in1[8]) * in2x2[4];
 585
 586     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 587               ((uint128_t) in1[6]) * in2x2[7] +
 588               ((uint128_t) in1[7]) * in2x2[6] +
 589               ((uint128_t) in1[8]) * in2x2[5];
 590
 591     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 592               ((uint128_t) in1[7]) * in2x2[7] +
 593               ((uint128_t) in1[8]) * in2x2[6];
 594
 595     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 596               ((uint128_t) in1[8]) * in2x2[7];
 597
 598     out[7] += ((uint128_t) in1[8]) * in2x2[8];
 599 }
 600
 601 static const limb bottom52bits = 0xfffffffffffff;
 602
 603 /*-
 604  * felem_reduce converts a largefelem to an felem.
 605  * On entry:
 606  *   in[i] < 2^128
 607  * On exit:
 608  *   out[i] < 2^59 + 2^14
 609  */
 610 static void felem_reduce(felem out, const largefelem in)
 611 {
 612     u64 overflow1, overflow2;
 613
 614     out[0] = ((limb) in[0]) & bottom58bits;
 615     out[1] = ((limb) in[1]) & bottom58bits;
 616     out[2] = ((limb) in[2]) & bottom58bits;
 617     out[3] = ((limb) in[3]) & bottom58bits;
 618     out[4] = ((limb) in[4]) & bottom58bits;
 619     out[5] = ((limb) in[5]) & bottom58bits;
 620     out[6] = ((limb) in[6]) & bottom58bits;
 621     out[7] = ((limb) in[7]) & bottom58bits;
 622     out[8] = ((limb) in[8]) & bottom58bits;
 623
 624     /* out[i] < 2^58 */
 625
 626     out[1] += ((limb) in[0]) >> 58;
 627     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 628     /*-
 629      * out[1] < 2^58 + 2^6 + 2^58
 630      *        = 2^59 + 2^6
 631      */
 632     out[2] += ((limb) (in[0] >> 64)) >> 52;
 633
 634     out[2] += ((limb) in[1]) >> 58;
 635     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 636     out[3] += ((limb) (in[1] >> 64)) >> 52;
 637
 638     out[3] += ((limb) in[2]) >> 58;
 639     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 640     out[4] += ((limb) (in[2] >> 64)) >> 52;
 641
 642     out[4] += ((limb) in[3]) >> 58;
 643     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 644     out[5] += ((limb) (in[3] >> 64)) >> 52;
 645
 646     out[5] += ((limb) in[4]) >> 58;
 647     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 648     out[6] += ((limb) (in[4] >> 64)) >> 52;
 649
 650     out[6] += ((limb) in[5]) >> 58;
 651     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 652     out[7] += ((limb) (in[5] >> 64)) >> 52;
 653
 654     out[7] += ((limb) in[6]) >> 58;
 655     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 656     out[8] += ((limb) (in[6] >> 64)) >> 52;
 657
 658     out[8] += ((limb) in[7]) >> 58;
 659     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 660     /*-
 661      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 662      *            < 2^59 + 2^13
 663      */
 664     overflow1 = ((limb) (in[7] >> 64)) >> 52;
 665
 666     overflow1 += ((limb) in[8]) >> 58;
 667     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 668     overflow2 = ((limb) (in[8] >> 64)) >> 52;
 669
 670     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
 671     overflow2 <<= 1;            /* overflow2 < 2^13 */
 672
 673     out[0] += overflow1;        /* out[0] < 2^60 */
 674     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
 675
 676     out[1] += out[0] >> 58;
 677     out[0] &= bottom58bits;
 678     /*-
 679      * out[0] < 2^58
 680      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 681      *        < 2^59 + 2^14
 682      */
 683 }
 684
 685 static void felem_square_reduce(felem out, const felem in)
 686 {
 687     largefelem tmp;
 688     felem_square(tmp, in);
 689     felem_reduce(out, tmp);
 690 }
 691
 692 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 693 {
 694     largefelem tmp;
 695     felem_mul(tmp, in1, in2);
 696     felem_reduce(out, tmp);
 697 }
 698
 699 /*-
 700  * felem_inv calculates |out| = |in|^{-1}
 701  *
 702  * Based on Fermat's Little Theorem:
 703  *   a^p = a (mod p)
 704  *   a^{p-1} = 1 (mod p)
 705  *   a^{p-2} = a^{-1} (mod p)
 706  */
 707 static void felem_inv(felem out, const felem in)
 708 {
 709     felem ftmp, ftmp2, ftmp3, ftmp4;
 710     largefelem tmp;
 711     unsigned i;
 712
 713     felem_square(tmp, in);
 714     felem_reduce(ftmp, tmp);    /* 2^1 */
 715     felem_mul(tmp, in, ftmp);
 716     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
 717     felem_assign(ftmp2, ftmp);
 718     felem_square(tmp, ftmp);
 719     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
 720     felem_mul(tmp, in, ftmp);
 721     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
 722     felem_square(tmp, ftmp);
 723     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
 724
 725     felem_square(tmp, ftmp2);
 726     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
 727     felem_square(tmp, ftmp3);
 728     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
 729     felem_mul(tmp, ftmp3, ftmp2);
 730     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
 731
 732     felem_assign(ftmp2, ftmp3);
 733     felem_square(tmp, ftmp3);
 734     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
 735     felem_square(tmp, ftmp3);
 736     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
 737     felem_square(tmp, ftmp3);
 738     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
 739     felem_square(tmp, ftmp3);
 740     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
 741     felem_assign(ftmp4, ftmp3);
 742     felem_mul(tmp, ftmp3, ftmp);
 743     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
 744     felem_square(tmp, ftmp4);
 745     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
 746     felem_mul(tmp, ftmp3, ftmp2);
 747     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
 748     felem_assign(ftmp2, ftmp3);
 749
 750     for (i = 0; i < 8; i++) {
 751         felem_square(tmp, ftmp3);
 752         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
 753     }
 754     felem_mul(tmp, ftmp3, ftmp2);
 755     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
 756     felem_assign(ftmp2, ftmp3);
 757
 758     for (i = 0; i < 16; i++) {
 759         felem_square(tmp, ftmp3);
 760         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
 761     }
 762     felem_mul(tmp, ftmp3, ftmp2);
 763     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
 764     felem_assign(ftmp2, ftmp3);
 765
 766     for (i = 0; i < 32; i++) {
 767         felem_square(tmp, ftmp3);
 768         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
 769     }
 770     felem_mul(tmp, ftmp3, ftmp2);
 771     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
 772     felem_assign(ftmp2, ftmp3);
 773
 774     for (i = 0; i < 64; i++) {
 775         felem_square(tmp, ftmp3);
 776         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
 777     }
 778     felem_mul(tmp, ftmp3, ftmp2);
 779     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
 780     felem_assign(ftmp2, ftmp3);
 781
 782     for (i = 0; i < 128; i++) {
 783         felem_square(tmp, ftmp3);
 784         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
 785     }
 786     felem_mul(tmp, ftmp3, ftmp2);
 787     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
 788     felem_assign(ftmp2, ftmp3);
 789
 790     for (i = 0; i < 256; i++) {
 791         felem_square(tmp, ftmp3);
 792         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
 793     }
 794     felem_mul(tmp, ftmp3, ftmp2);
 795     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
 796
 797     for (i = 0; i < 9; i++) {
 798         felem_square(tmp, ftmp3);
 799         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
 800     }
 801     felem_mul(tmp, ftmp3, ftmp4);
 802     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
 803     felem_mul(tmp, ftmp3, in);
 804     felem_reduce(out, tmp);     /* 2^512 - 3 */
 805 }
 806
 807 /*
      * This is 2^521-1, expressed as an felem: eight limbs with all 58 low bits
      * set, followed by a top limb with all 57 low bits set (8*58 + 57 = 521).
      */
 808 static const felem kPrime = {
 809     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 810     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 811     0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
 812 };
 813
 814 /*-
 815  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 816  * otherwise.
 817  * On entry:
 818  *   in[i] < 2^59 + 2^14
 819  */
 820 static limb felem_is_zero(const felem in)
 821 {
 822     felem ftmp;
 823     limb is_zero, is_p;
 824     felem_assign(ftmp, in);
 825
 826     ftmp[0] += ftmp[8] >> 57;
 827     ftmp[8] &= bottom57bits;
 828     /* ftmp[8] < 2^57 */
 829     ftmp[1] += ftmp[0] >> 58;
 830     ftmp[0] &= bottom58bits;
 831     ftmp[2] += ftmp[1] >> 58;
 832     ftmp[1] &= bottom58bits;
 833     ftmp[3] += ftmp[2] >> 58;
 834     ftmp[2] &= bottom58bits;
 835     ftmp[4] += ftmp[3] >> 58;
 836     ftmp[3] &= bottom58bits;
 837     ftmp[5] += ftmp[4] >> 58;
 838     ftmp[4] &= bottom58bits;
 839     ftmp[6] += ftmp[5] >> 58;
 840     ftmp[5] &= bottom58bits;
 841     ftmp[7] += ftmp[6] >> 58;
 842     ftmp[6] &= bottom58bits;
 843     ftmp[8] += ftmp[7] >> 58;
 844     ftmp[7] &= bottom58bits;
 845     /* ftmp[8] < 2^57 + 4 */
 846
 847     /*
 848      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
 849      * than our bound for ftmp[8]. Therefore we only have to check if the
 850      * zero is zero or 2^521-1.
 851      */
 852
 853     is_zero = 0;
 854     is_zero |= ftmp[0];
 855     is_zero |= ftmp[1];
 856     is_zero |= ftmp[2];
 857     is_zero |= ftmp[3];
 858     is_zero |= ftmp[4];
 859     is_zero |= ftmp[5];
 860     is_zero |= ftmp[6];
 861     is_zero |= ftmp[7];
 862     is_zero |= ftmp[8];
 863
 864     is_zero--;
 865     /*
 866      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
 867      * can be set is if is_zero was 0 before the decrement.
 868      */
 869     is_zero = ((s64) is_zero) >> 63;
 870
 871     is_p = ftmp[0] ^ kPrime[0];
 872     is_p |= ftmp[1] ^ kPrime[1];
 873     is_p |= ftmp[2] ^ kPrime[2];
 874     is_p |= ftmp[3] ^ kPrime[3];
 875     is_p |= ftmp[4] ^ kPrime[4];
 876     is_p |= ftmp[5] ^ kPrime[5];
 877     is_p |= ftmp[6] ^ kPrime[6];
 878     is_p |= ftmp[7] ^ kPrime[7];
 879     is_p |= ftmp[8] ^ kPrime[8];
 880
 881     is_p--;
 882     is_p = ((s64) is_p) >> 63;
 883
 884     is_zero |= is_p;
 885     return is_zero;
 886 }
 887
 888 static int felem_is_zero_int(const felem in)
 889 {
 890     return (int)(felem_is_zero(in) & ((limb) 1));
 891 }
 892
 893 /*-
 894  * felem_contract converts |in| to its unique, minimal representation.
 895  * On entry:
 896  *   in[i] < 2^59 + 2^14
 897  */
 898 static void felem_contract(felem out, const felem in)
 899 {
 900     limb is_p, is_greater, sign;
 901     static const limb two58 = ((limb) 1) << 58;
 902
 903     felem_assign(out, in);
 904
 905     out[0] += out[8] >> 57;
 906     out[8] &= bottom57bits;
 907     /* out[8] < 2^57 */
 908     out[1] += out[0] >> 58;
 909     out[0] &= bottom58bits;
 910     out[2] += out[1] >> 58;
 911     out[1] &= bottom58bits;
 912     out[3] += out[2] >> 58;
 913     out[2] &= bottom58bits;
 914     out[4] += out[3] >> 58;
 915     out[3] &= bottom58bits;
 916     out[5] += out[4] >> 58;
 917     out[4] &= bottom58bits;
 918     out[6] += out[5] >> 58;
 919     out[5] &= bottom58bits;
 920     out[7] += out[6] >> 58;
 921     out[6] &= bottom58bits;
 922     out[8] += out[7] >> 58;
 923     out[7] &= bottom58bits;
 924     /* out[8] < 2^57 + 4 */
 925
 926     /*
 927      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
 928      * out. See the comments in felem_is_zero regarding why we don't test for
 929      * other multiples of the prime.
 930      */
 931
 932     /*
 933      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
 934      */
 935
 936     is_p = out[0] ^ kPrime[0];
 937     is_p |= out[1] ^ kPrime[1];
 938     is_p |= out[2] ^ kPrime[2];
 939     is_p |= out[3] ^ kPrime[3];
 940     is_p |= out[4] ^ kPrime[4];
 941     is_p |= out[5] ^ kPrime[5];
 942     is_p |= out[6] ^ kPrime[6];
 943     is_p |= out[7] ^ kPrime[7];
 944     is_p |= out[8] ^ kPrime[8];
 945
 946     is_p--;
 947     is_p &= is_p << 32;
 948     is_p &= is_p << 16;
 949     is_p &= is_p << 8;
 950     is_p &= is_p << 4;
 951     is_p &= is_p << 2;
 952     is_p &= is_p << 1;
 953     is_p = ((s64) is_p) >> 63;
 954     is_p = ~is_p;
 955
 956     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
 957
 958     out[0] &= is_p;
 959     out[1] &= is_p;
 960     out[2] &= is_p;
 961     out[3] &= is_p;
 962     out[4] &= is_p;
 963     out[5] &= is_p;
 964     out[6] &= is_p;
 965     out[7] &= is_p;
 966     out[8] &= is_p;
 967
 968     /*
 969      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
 970      * 57 is greater than zero as (2^521-1) + x >= 2^522
 971      */
 972     is_greater = out[8] >> 57;
 973     is_greater |= is_greater << 32;
 974     is_greater |= is_greater << 16;
 975     is_greater |= is_greater << 8;
 976     is_greater |= is_greater << 4;
 977     is_greater |= is_greater << 2;
 978     is_greater |= is_greater << 1;
 979     is_greater = ((s64) is_greater) >> 63;
 980
 981     out[0] -= kPrime[0] & is_greater;
 982     out[1] -= kPrime[1] & is_greater;
 983     out[2] -= kPrime[2] & is_greater;
 984     out[3] -= kPrime[3] & is_greater;
 985     out[4] -= kPrime[4] & is_greater;
 986     out[5] -= kPrime[5] & is_greater;
 987     out[6] -= kPrime[6] & is_greater;
 988     out[7] -= kPrime[7] & is_greater;
 989     out[8] -= kPrime[8] & is_greater;
 990
 991     /* Eliminate negative coefficients */
 992     sign = -(out[0] >> 63);
 993     out[0] += (two58 & sign);
 994     out[1] -= (1 & sign);
 995     sign = -(out[1] >> 63);
 996     out[1] += (two58 & sign);
 997     out[2] -= (1 & sign);
 998     sign = -(out[2] >> 63);
 999     out[2] += (two58 & sign);
1000     out[3] -= (1 & sign);
1001     sign = -(out[3] >> 63);
1002     out[3] += (two58 & sign);
1003     out[4] -= (1 & sign);
1004     sign = -(out[4] >> 63);
1005     out[4] += (two58 & sign);
1006     out[5] -= (1 & sign);
1007     sign = -(out[0] >> 63);
1008     out[5] += (two58 & sign);
1009     out[6] -= (1 & sign);
1010     sign = -(out[6] >> 63);
1011     out[6] += (two58 & sign);
1012     out[7] -= (1 & sign);
1013     sign = -(out[7] >> 63);
1014     out[7] += (two58 & sign);
1015     out[8] -= (1 & sign);
1016     sign = -(out[5] >> 63);
1017     out[5] += (two58 & sign);
1018     out[6] -= (1 & sign);
1019     sign = -(out[6] >> 63);
1020     out[6] += (two58 & sign);
1021     out[7] -= (1 & sign);
1022     sign = -(out[7] >> 63);
1023     out[7] += (two58 & sign);
1024     out[8] -= (1 & sign);
1025 }
1026
1027 /*-
1028  * Group operations
1029  * ----------------
1030  *
1031  * Building on top of the field operations we have the operations on the
1032  * elliptic curve group itself. Points on the curve are represented in Jacobian
1033  * coordinates */
1034
1035 /*-
1036  * point_double calcuates 2*(x_in, y_in, z_in)
1037  *
1038  * The method is taken from:
1039  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1040  *
1041  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1042  * while x_out == y_in is not (maybe this works, but it's not tested). */
1043 static void
1044 point_double(felem x_out, felem y_out, felem z_out,
1045              const felem x_in, const felem y_in, const felem z_in)
1046 {
1047     largefelem tmp, tmp2;
1048     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1049
1050     felem_assign(ftmp, x_in);
1051     felem_assign(ftmp2, x_in);
1052
1053     /* delta = z^2 */
1054     felem_square(tmp, z_in);
1055     felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1056
1057     /* gamma = y^2 */
1058     felem_square(tmp, y_in);
1059     felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1060
1061     /* beta = x*gamma */
1062     felem_mul(tmp, x_in, gamma);
1063     felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1064
1065     /* alpha = 3*(x-delta)*(x+delta) */
1066     felem_diff64(ftmp, delta);
1067     /* ftmp[i] < 2^61 */
1068     felem_sum64(ftmp2, delta);
1069     /* ftmp2[i] < 2^60 + 2^15 */
1070     felem_scalar64(ftmp2, 3);
1071     /* ftmp2[i] < 3*2^60 + 3*2^15 */
1072     felem_mul(tmp, ftmp, ftmp2);
1073     /*-
1074      * tmp[i] < 17(3*2^121 + 3*2^76)
1075      *        = 61*2^121 + 61*2^76
1076      *        < 64*2^121 + 64*2^76
1077      *        = 2^127 + 2^82
1078      *        < 2^128
1079      */
1080     felem_reduce(alpha, tmp);
1081
1082     /* x' = alpha^2 - 8*beta */
1083     felem_square(tmp, alpha);
1084     /*
1085      * tmp[i] < 17*2^120 < 2^125
1086      */
1087     felem_assign(ftmp, beta);
1088     felem_scalar64(ftmp, 8);
1089     /* ftmp[i] < 2^62 + 2^17 */
1090     felem_diff_128_64(tmp, ftmp);
1091     /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1092     felem_reduce(x_out, tmp);
1093
1094     /* z' = (y + z)^2 - gamma - delta */
1095     felem_sum64(delta, gamma);
1096     /* delta[i] < 2^60 + 2^15 */
1097     felem_assign(ftmp, y_in);
1098     felem_sum64(ftmp, z_in);
1099     /* ftmp[i] < 2^60 + 2^15 */
1100     felem_square(tmp, ftmp);
1101     /*
1102      * tmp[i] < 17(2^122) < 2^127
1103      */
1104     felem_diff_128_64(tmp, delta);
1105     /* tmp[i] < 2^127 + 2^63 */
1106     felem_reduce(z_out, tmp);
1107
1108     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1109     felem_scalar64(beta, 4);
1110     /* beta[i] < 2^61 + 2^16 */
1111     felem_diff64(beta, x_out);
1112     /* beta[i] < 2^61 + 2^60 + 2^16 */
1113     felem_mul(tmp, alpha, beta);
1114     /*-
1115      * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1116      *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1117      *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1118      *        < 2^128
1119      */
1120     felem_square(tmp2, gamma);
1121     /*-
1122      * tmp2[i] < 17*(2^59 + 2^14)^2
1123      *         = 17*(2^118 + 2^74 + 2^28)
1124      */
1125     felem_scalar128(tmp2, 8);
1126     /*-
1127      * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1128      *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1129      *         < 2^126
1130      */
1131     felem_diff128(tmp, tmp2);
1132     /*-
1133      * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1134      *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1135      *          2^74 + 2^69 + 2^34 + 2^30
1136      *        < 2^128
1137      */
1138     felem_reduce(y_out, tmp);
1139 }
1140
1141 /* copy_conditional copies in to out iff mask is all ones. */
1142 static void copy_conditional(felem out, const felem in, limb mask)
1143 {
1144     unsigned i;
1145     for (i = 0; i < NLIMBS; ++i) {
1146         const limb tmp = mask & (in[i] ^ out[i]);
1147         out[i] ^= tmp;
1148     }
1149 }
1150
1151 /*-
1152  * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1153  *
1154  * The method is taken from
1155  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1156  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1157  *
1158  * This function includes a branch for checking whether the two input points
1159  * are equal (while not equal to the point at infinity). This case never
1160  * happens during single point multiplication, so there is no timing leak for
1161  * ECDH or ECDSA signing. */
1162 static void point_add(felem x3, felem y3, felem z3,
1163                       const felem x1, const felem y1, const felem z1,
1164                       const int mixed, const felem x2, const felem y2,
1165                       const felem z2)
1166 {
1167     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1168     largefelem tmp, tmp2;
1169     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1170
1171     z1_is_zero = felem_is_zero(z1);
1172     z2_is_zero = felem_is_zero(z2);
1173
1174     /* ftmp = z1z1 = z1**2 */
1175     felem_square(tmp, z1);
1176     felem_reduce(ftmp, tmp);
1177
1178     if (!mixed) {
1179         /* ftmp2 = z2z2 = z2**2 */
1180         felem_square(tmp, z2);
1181         felem_reduce(ftmp2, tmp);
1182
1183         /* u1 = ftmp3 = x1*z2z2 */
1184         felem_mul(tmp, x1, ftmp2);
1185         felem_reduce(ftmp3, tmp);
1186
1187         /* ftmp5 = z1 + z2 */
1188         felem_assign(ftmp5, z1);
1189         felem_sum64(ftmp5, z2);
1190         /* ftmp5[i] < 2^61 */
1191
1192         /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1193         felem_square(tmp, ftmp5);
1194         /* tmp[i] < 17*2^122 */
1195         felem_diff_128_64(tmp, ftmp);
1196         /* tmp[i] < 17*2^122 + 2^63 */
1197         felem_diff_128_64(tmp, ftmp2);
1198         /* tmp[i] < 17*2^122 + 2^64 */
1199         felem_reduce(ftmp5, tmp);
1200
1201         /* ftmp2 = z2 * z2z2 */
1202         felem_mul(tmp, ftmp2, z2);
1203         felem_reduce(ftmp2, tmp);
1204
1205         /* s1 = ftmp6 = y1 * z2**3 */
1206         felem_mul(tmp, y1, ftmp2);
1207         felem_reduce(ftmp6, tmp);
1208     } else {
1209         /*
1210          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1211          */
1212
1213         /* u1 = ftmp3 = x1*z2z2 */
1214         felem_assign(ftmp3, x1);
1215
1216         /* ftmp5 = 2*z1z2 */
1217         felem_scalar(ftmp5, z1, 2);
1218
1219         /* s1 = ftmp6 = y1 * z2**3 */
1220         felem_assign(ftmp6, y1);
1221     }
1222
1223     /* u2 = x2*z1z1 */
1224     felem_mul(tmp, x2, ftmp);
1225     /* tmp[i] < 17*2^120 */
1226
1227     /* h = ftmp4 = u2 - u1 */
1228     felem_diff_128_64(tmp, ftmp3);
1229     /* tmp[i] < 17*2^120 + 2^63 */
1230     felem_reduce(ftmp4, tmp);
1231
1232     x_equal = felem_is_zero(ftmp4);
1233
1234     /* z_out = ftmp5 * h */
1235     felem_mul(tmp, ftmp5, ftmp4);
1236     felem_reduce(z_out, tmp);
1237
1238     /* ftmp = z1 * z1z1 */
1239     felem_mul(tmp, ftmp, z1);
1240     felem_reduce(ftmp, tmp);
1241
1242     /* s2 = tmp = y2 * z1**3 */
1243     felem_mul(tmp, y2, ftmp);
1244     /* tmp[i] < 17*2^120 */
1245
1246     /* r = ftmp5 = (s2 - s1)*2 */
1247     felem_diff_128_64(tmp, ftmp6);
1248     /* tmp[i] < 17*2^120 + 2^63 */
1249     felem_reduce(ftmp5, tmp);
1250     y_equal = felem_is_zero(ftmp5);
1251     felem_scalar64(ftmp5, 2);
1252     /* ftmp5[i] < 2^61 */
1253
1254     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1255         point_double(x3, y3, z3, x1, y1, z1);
1256         return;
1257     }
1258
1259     /* I = ftmp = (2h)**2 */
1260     felem_assign(ftmp, ftmp4);
1261     felem_scalar64(ftmp, 2);
1262     /* ftmp[i] < 2^61 */
1263     felem_square(tmp, ftmp);
1264     /* tmp[i] < 17*2^122 */
1265     felem_reduce(ftmp, tmp);
1266
1267     /* J = ftmp2 = h * I */
1268     felem_mul(tmp, ftmp4, ftmp);
1269     felem_reduce(ftmp2, tmp);
1270
1271     /* V = ftmp4 = U1 * I */
1272     felem_mul(tmp, ftmp3, ftmp);
1273     felem_reduce(ftmp4, tmp);
1274
1275     /* x_out = r**2 - J - 2V */
1276     felem_square(tmp, ftmp5);
1277     /* tmp[i] < 17*2^122 */
1278     felem_diff_128_64(tmp, ftmp2);
1279     /* tmp[i] < 17*2^122 + 2^63 */
1280     felem_assign(ftmp3, ftmp4);
1281     felem_scalar64(ftmp4, 2);
1282     /* ftmp4[i] < 2^61 */
1283     felem_diff_128_64(tmp, ftmp4);
1284     /* tmp[i] < 17*2^122 + 2^64 */
1285     felem_reduce(x_out, tmp);
1286
1287     /* y_out = r(V-x_out) - 2 * s1 * J */
1288     felem_diff64(ftmp3, x_out);
1289     /*
1290      * ftmp3[i] < 2^60 + 2^60 = 2^61
1291      */
1292     felem_mul(tmp, ftmp5, ftmp3);
1293     /* tmp[i] < 17*2^122 */
1294     felem_mul(tmp2, ftmp6, ftmp2);
1295     /* tmp2[i] < 17*2^120 */
1296     felem_scalar128(tmp2, 2);
1297     /* tmp2[i] < 17*2^121 */
1298     felem_diff128(tmp, tmp2);
1299         /*-
1300          * tmp[i] < 2^127 - 2^69 + 17*2^122
1301          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1302          *        < 2^127
1303          */
1304     felem_reduce(y_out, tmp);
1305
1306     copy_conditional(x_out, x2, z1_is_zero);
1307     copy_conditional(x_out, x1, z2_is_zero);
1308     copy_conditional(y_out, y2, z1_is_zero);
1309     copy_conditional(y_out, y1, z2_is_zero);
1310     copy_conditional(z_out, z2, z1_is_zero);
1311     copy_conditional(z_out, z1, z2_is_zero);
1312     felem_assign(x3, x_out);
1313     felem_assign(y3, y_out);
1314     felem_assign(z3, z_out);
1315 }
1316
1317 /*-
1318  * Base point pre computation
1319  * --------------------------
1320  *
1321  * Two different sorts of precomputed tables are used in the following code.
1322  * Each contain various points on the curve, where each point is three field
1323  * elements (x, y, z).
1324  *
1325  * For the base point table, z is usually 1 (0 for the point at infinity).
1326  * This table has 16 elements:
1327  * index | bits    | point
1328  * ------+---------+------------------------------
1329  *     0 | 0 0 0 0 | 0G
1330  *     1 | 0 0 0 1 | 1G
1331  *     2 | 0 0 1 0 | 2^130G
1332  *     3 | 0 0 1 1 | (2^130 + 1)G
1333  *     4 | 0 1 0 0 | 2^260G
1334  *     5 | 0 1 0 1 | (2^260 + 1)G
1335  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1336  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1337  *     8 | 1 0 0 0 | 2^390G
1338  *     9 | 1 0 0 1 | (2^390 + 1)G
1339  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1340  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1341  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1342  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1343  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1344  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1345  *
1346  * The reason for this is so that we can clock bits into four different
1347  * locations when doing simple scalar multiplies against the base point.
1348  *
1349  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1350
/*
 * gmul is the table of precomputed base points. Each of the 16 entries is
 * one point stored as three field elements {x, y, z} of nine limbs each.
 * Entry 0 is all zeros; every other entry has z = 1.
 */
static const felem gmul[16][3] = {
{{0, 0, 0, 0, 0, 0, 0, 0, 0},
 {0, 0, 0, 0, 0, 0, 0, 0, 0},
 {0, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
  0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
  0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
 {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
  0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
  0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
  0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
  0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
 {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
  0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
  0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
  0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
  0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
 {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
  0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
  0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
  0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
  0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
 {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
  0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
  0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
  0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
  0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
 {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
  0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
  0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
  0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
  0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
 {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
  0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
  0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
  0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
  0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
 {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
  0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
  0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
  0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
  0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
 {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
  0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
  0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
  0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
  0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
 {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
  0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
  0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
  0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
  0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
 {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
  0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
  0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
  0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
  0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
 {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
  0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
  0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
  0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
  0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
 {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
  0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
  0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
  0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
  0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
 {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
  0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
  0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
  0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
  0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
 {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
  0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
  0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
  0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
  0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
 {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
  0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
  0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}}
};
1462
1463 /*
1464  * select_point selects the |idx|th point from a precomputation table and
1465  * copies it to out.
1466  */
1467  /* pre_comp below is of the size provided in |size| */
1468 static void select_point(const limb idx, unsigned int size,
1469                          const felem pre_comp[][3], felem out[3])
1470 {
1471     unsigned i, j;
1472     limb *outlimbs = &out[0][0];
1473     memset(outlimbs, 0, 3 * sizeof(felem));
1474
1475     for (i = 0; i < size; i++) {
1476         const limb *inlimbs = &pre_comp[i][0][0];
1477         limb mask = i ^ idx;
1478         mask |= mask >> 4;
1479         mask |= mask >> 2;
1480         mask |= mask >> 1;
1481         mask &= 1;
1482         mask--;
1483         for (j = 0; j < NLIMBS * 3; j++)
1484             outlimbs[j] |= inlimbs[j] & mask;
1485     }
1486 }
1487
1488 /* get_bit returns the |i|th bit in |in| */
1489 static char get_bit(const felem_bytearray in, int i)
1490 {
1491     if (i < 0)
1492         return 0;
1493     return (in[i >> 3] >> (i & 7)) & 1;
1494 }
1495
1496 /*
1497  * Interleaved point multiplication using precomputed point multiples: The
1498  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1499  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1500  * generator, using certain (large) precomputed multiples in g_pre_comp.
1501  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1502  */
1503 static void batch_mul(felem x_out, felem y_out, felem z_out,
1504                       const felem_bytearray scalars[],
1505                       const unsigned num_points, const u8 *g_scalar,
1506                       const int mixed, const felem pre_comp[][17][3],
1507                       const felem g_pre_comp[16][3])
1508 {
1509     int i, skip;
1510     unsigned num, gen_mul = (g_scalar != NULL);
1511     felem nq[3], tmp[4];
1512     limb bits;
1513     u8 sign, digit;
1514
1515     /* set nq to the point at infinity */
1516     memset(nq, 0, 3 * sizeof(felem));
1517
1518     /*
1519      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1520      * of the generator (last quarter of rounds) and additions of other
1521      * points multiples (every 5th round).
1522      */
1523     skip = 1;                   /* save two point operations in the first
1524                                  * round */
1525     for (i = (num_points ? 520 : 130); i >= 0; --i) {
1526         /* double */
1527         if (!skip)
1528             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1529
1530         /* add multiples of the generator */
1531         if (gen_mul && (i <= 130)) {
1532             bits = get_bit(g_scalar, i + 390) << 3;
1533             if (i < 130) {
1534                 bits |= get_bit(g_scalar, i + 260) << 2;
1535                 bits |= get_bit(g_scalar, i + 130) << 1;
1536                 bits |= get_bit(g_scalar, i);
1537             }
1538             /* select the point to add, in constant time */
1539             select_point(bits, 16, g_pre_comp, tmp);
1540             if (!skip) {
1541                 /* The 1 argument below is for "mixed" */
1542                 point_add(nq[0], nq[1], nq[2],
1543                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1544             } else {
1545                 memcpy(nq, tmp, 3 * sizeof(felem));
1546                 skip = 0;
1547             }
1548         }
1549
1550         /* do other additions every 5 doublings */
1551         if (num_points && (i % 5 == 0)) {
1552             /* loop over all scalars */
1553             for (num = 0; num < num_points; ++num) {
1554                 bits = get_bit(scalars[num], i + 4) << 5;
1555                 bits |= get_bit(scalars[num], i + 3) << 4;
1556                 bits |= get_bit(scalars[num], i + 2) << 3;
1557                 bits |= get_bit(scalars[num], i + 1) << 2;
1558                 bits |= get_bit(scalars[num], i) << 1;
1559                 bits |= get_bit(scalars[num], i - 1);
1560                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1561
1562                 /*
1563                  * select the point to add or subtract, in constant time
1564                  */
1565                 select_point(digit, 17, pre_comp[num], tmp);
1566                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1567                                             * point */
1568                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1569
1570                 if (!skip) {
1571                     point_add(nq[0], nq[1], nq[2],
1572                               nq[0], nq[1], nq[2],
1573                               mixed, tmp[0], tmp[1], tmp[2]);
1574                 } else {
1575                     memcpy(nq, tmp, 3 * sizeof(felem));
1576                     skip = 0;
1577                 }
1578             }
1579         }
1580     }
1581     felem_assign(x_out, nq[0]);
1582     felem_assign(y_out, nq[1]);
1583     felem_assign(z_out, nq[2]);
1584 }
1585
/*
 * Precomputation for the group generator.
 *
 * g_pre_comp holds 16 points of three field elements each, the same shape
 * as the built-in gmul table. references is a counter that this file only
 * changes through CRYPTO_add() with CRYPTO_LOCK_EC_PRE_COMP.
 */
typedef struct {
    felem g_pre_comp[16][3];
    int references;
} NISTP521_PRE_COMP;
1591
/*
 * EC_GFp_nistp521_method returns a pointer to the function-local static
 * EC_METHOD table for this implementation. The table combines the
 * ec_GFp_nistp521_* routines (group init, set curve, affine coordinate
 * extraction, point multiplication and the precompute hooks) with generic
 * ec_GFp_simple_* and ec_GFp_nist_* routines; slots given as 0 are named by
 * the comment next to them.
 */
const EC_METHOD *EC_GFp_nistp521_method(void)
{
    static const EC_METHOD ret = {
        EC_FLAGS_DEFAULT_OCT,
        NID_X9_62_prime_field,
        ec_GFp_nistp521_group_init,
        ec_GFp_simple_group_finish,
        ec_GFp_simple_group_clear_finish,
        ec_GFp_nist_group_copy,
        ec_GFp_nistp521_group_set_curve,
        ec_GFp_simple_group_get_curve,
        ec_GFp_simple_group_get_degree,
        ec_GFp_simple_group_check_discriminant,
        ec_GFp_simple_point_init,
        ec_GFp_simple_point_finish,
        ec_GFp_simple_point_clear_finish,
        ec_GFp_simple_point_copy,
        ec_GFp_simple_point_set_to_infinity,
        ec_GFp_simple_set_Jprojective_coordinates_GFp,
        ec_GFp_simple_get_Jprojective_coordinates_GFp,
        ec_GFp_simple_point_set_affine_coordinates,
        ec_GFp_nistp521_point_get_affine_coordinates,
        0 /* point_set_compressed_coordinates */ ,
        0 /* point2oct */ ,
        0 /* oct2point */ ,
        ec_GFp_simple_add,
        ec_GFp_simple_dbl,
        ec_GFp_simple_invert,
        ec_GFp_simple_is_at_infinity,
        ec_GFp_simple_is_on_curve,
        ec_GFp_simple_cmp,
        ec_GFp_simple_make_affine,
        ec_GFp_simple_points_make_affine,
        ec_GFp_nistp521_points_mul,
        ec_GFp_nistp521_precompute_mult,
        ec_GFp_nistp521_have_precompute_mult,
        ec_GFp_nist_field_mul,
        ec_GFp_nist_field_sqr,
        0 /* field_div */ ,
        0 /* field_encode */ ,
        0 /* field_decode */ ,
        0                       /* field_set_to_one */
    };

    /* the table has static storage duration, so the pointer stays valid */
    return &ret;
}
1638
1639 /******************************************************************************/
1640 /*
1641  * FUNCTIONS TO MANAGE PRECOMPUTATION
1642  */
1643
1644 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1645 {
1646     NISTP521_PRE_COMP *ret = NULL;
1647     ret = OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1648     if (!ret) {
1649         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1650         return ret;
1651     }
1652     memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1653     ret->references = 1;
1654     return ret;
1655 }
1656
1657 static void *nistp521_pre_comp_dup(void *src_)
1658 {
1659     NISTP521_PRE_COMP *src = src_;
1660
1661     /* no need to actually copy, these objects never change! */
1662     CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1663
1664     return src_;
1665 }
1666
1667 static void nistp521_pre_comp_free(void *pre_)
1668 {
1669     int i;
1670     NISTP521_PRE_COMP *pre = pre_;
1671
1672     if (!pre)
1673         return;
1674
1675     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1676     if (i > 0)
1677         return;
1678
1679     OPENSSL_free(pre);
1680 }
1681
1682 static void nistp521_pre_comp_clear_free(void *pre_)
1683 {
1684     int i;
1685     NISTP521_PRE_COMP *pre = pre_;
1686
1687     if (!pre)
1688         return;
1689
1690     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1691     if (i > 0)
1692         return;
1693
1694     OPENSSL_clear_free(pre, sizeof(*pre));
1695 }
1696
1697 /******************************************************************************/
1698 /*
1699  * OPENSSL EC_METHOD FUNCTIONS
1700  */
1701
1702 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1703 {
1704     int ret;
1705     ret = ec_GFp_simple_group_init(group);
1706     group->a_is_minus3 = 1;
1707     return ret;
1708 }
1709
1710 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1711                                     const BIGNUM *a, const BIGNUM *b,
1712                                     BN_CTX *ctx)
1713 {
1714     int ret = 0;
1715     BN_CTX *new_ctx = NULL;
1716     BIGNUM *curve_p, *curve_a, *curve_b;
1717
1718     if (ctx == NULL)
1719         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1720             return 0;
1721     BN_CTX_start(ctx);
1722     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1723         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1724         ((curve_b = BN_CTX_get(ctx)) == NULL))
1725         goto err;
1726     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1727     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1728     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1729     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1730         ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1731               EC_R_WRONG_CURVE_PARAMETERS);
1732         goto err;
1733     }
1734     group->field_mod_func = BN_nist_mod_521;
1735     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1736  err:
1737     BN_CTX_end(ctx);
1738     BN_CTX_free(new_ctx);
1739     return ret;
1740 }
1741
1742 /*
1743  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1744  * (X/Z^2, Y/Z^3)
1745  */
1746 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1747                                                  const EC_POINT *point,
1748                                                  BIGNUM *x, BIGNUM *y,
1749                                                  BN_CTX *ctx)
1750 {
1751     felem z1, z2, x_in, y_in, x_out, y_out;
1752     largefelem tmp;
1753
1754     if (EC_POINT_is_at_infinity(group, point)) {
1755         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1756               EC_R_POINT_AT_INFINITY);
1757         return 0;
1758     }
1759     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1760         (!BN_to_felem(z1, point->Z)))
1761         return 0;
1762     felem_inv(z2, z1);
1763     felem_square(tmp, z2);
1764     felem_reduce(z1, tmp);
1765     felem_mul(tmp, x_in, z1);
1766     felem_reduce(x_in, tmp);
1767     felem_contract(x_out, x_in);
1768     if (x != NULL) {
1769         if (!felem_to_BN(x, x_out)) {
1770             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1771                   ERR_R_BN_LIB);
1772             return 0;
1773         }
1774     }
1775     felem_mul(tmp, z1, z2);
1776     felem_reduce(z1, tmp);
1777     felem_mul(tmp, y_in, z1);
1778     felem_reduce(y_in, tmp);
1779     felem_contract(y_out, y_in);
1780     if (y != NULL) {
1781         if (!felem_to_BN(y, y_out)) {
1782             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1783                   ERR_R_BN_LIB);
1784             return 0;
1785         }
1786     }
1787     return 1;
1788 }
1789
1790 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1791 static void make_points_affine(size_t num, felem points[][3],
1792                                felem tmp_felems[])
1793 {
1794     /*
1795      * Runs in constant time, unless an input is the point at infinity (which
1796      * normally shouldn't happen).
1797      */
1798     ec_GFp_nistp_points_make_affine_internal(num,
1799                                              points,
1800                                              sizeof(felem),
1801                                              tmp_felems,
1802                                              (void (*)(void *))felem_one,
1803                                              (int (*)(const void *))
1804                                              felem_is_zero_int,
1805                                              (void (*)(void *, const void *))
1806                                              felem_assign,
1807                                              (void (*)(void *, const void *))
1808                                              felem_square_reduce, (void (*)
1809                                                                    (void *,
1810                                                                     const void
1811                                                                     *,
1812                                                                     const void
1813                                                                     *))
1814                                              felem_mul_reduce,
1815                                              (void (*)(void *, const void *))
1816                                              felem_inv,
1817                                              (void (*)(void *, const void *))
1818                                              felem_contract);
1819 }
1820
1821 /*
1822  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1823  * values Result is stored in r (r can equal one of the inputs).
1824  */
1825 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1826                                const BIGNUM *scalar, size_t num,
1827                                const EC_POINT *points[],
1828                                const BIGNUM *scalars[], BN_CTX *ctx)
1829 {
1830     int ret = 0;
1831     int j;
1832     int mixed = 0;
1833     BN_CTX *new_ctx = NULL;
1834     BIGNUM *x, *y, *z, *tmp_scalar;
1835     felem_bytearray g_secret;
1836     felem_bytearray *secrets = NULL;
1837     felem(*pre_comp)[17][3] = NULL;
1838     felem *tmp_felems = NULL;
1839     felem_bytearray tmp;
1840     unsigned i, num_bytes;
1841     int have_pre_comp = 0;
1842     size_t num_points = num;
1843     felem x_in, y_in, z_in, x_out, y_out, z_out;
1844     NISTP521_PRE_COMP *pre = NULL;
1845     felem(*g_pre_comp)[3] = NULL;
1846     EC_POINT *generator = NULL;
1847     const EC_POINT *p = NULL;
1848     const BIGNUM *p_scalar = NULL;
1849
1850     if (ctx == NULL)
1851         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1852             return 0;
1853     BN_CTX_start(ctx);
1854     if (((x = BN_CTX_get(ctx)) == NULL) ||
1855         ((y = BN_CTX_get(ctx)) == NULL) ||
1856         ((z = BN_CTX_get(ctx)) == NULL) ||
1857         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1858         goto err;
1859
1860     if (scalar != NULL) {
1861         pre = EC_EX_DATA_get_data(group->extra_data,
1862                                   nistp521_pre_comp_dup,
1863                                   nistp521_pre_comp_free,
1864                                   nistp521_pre_comp_clear_free);
1865         if (pre)
1866             /* we have precomputation, try to use it */
1867             g_pre_comp = &pre->g_pre_comp[0];
1868         else
1869             /* try to use the standard precomputation */
1870             g_pre_comp = (felem(*)[3]) gmul;
1871         generator = EC_POINT_new(group);
1872         if (generator == NULL)
1873             goto err;
1874         /* get the generator from precomputation */
1875         if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1876             !felem_to_BN(y, g_pre_comp[1][1]) ||
1877             !felem_to_BN(z, g_pre_comp[1][2])) {
1878             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1879             goto err;
1880         }
1881         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1882                                                       generator, x, y, z,
1883                                                       ctx))
1884             goto err;
1885         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1886             /* precomputation matches generator */
1887             have_pre_comp = 1;
1888         else
1889             /*
1890              * we don't have valid precomputation: treat the generator as a
1891              * random point
1892              */
1893             num_points++;
1894     }
1895
1896     if (num_points > 0) {
1897         if (num_points >= 2) {
1898             /*
1899              * unless we precompute multiples for just one point, converting
1900              * those into affine form is time well spent
1901              */
1902             mixed = 1;
1903         }
1904         secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1905         pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1906         if (mixed)
1907             tmp_felems =
1908                 OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1909         if ((secrets == NULL) || (pre_comp == NULL)
1910             || (mixed && (tmp_felems == NULL))) {
1911             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1912             goto err;
1913         }
1914
1915         /*
1916          * we treat NULL scalars as 0, and NULL points as points at infinity,
1917          * i.e., they contribute nothing to the linear combination
1918          */
1919         memset(secrets, 0, num_points * sizeof(felem_bytearray));
1920         memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1921         for (i = 0; i < num_points; ++i) {
1922             if (i == num)
1923                 /*
1924                  * we didn't have a valid precomputation, so we pick the
1925                  * generator
1926                  */
1927             {
1928                 p = EC_GROUP_get0_generator(group);
1929                 p_scalar = scalar;
1930             } else
1931                 /* the i^th point */
1932             {
1933                 p = points[i];
1934                 p_scalar = scalars[i];
1935             }
1936             if ((p_scalar != NULL) && (p != NULL)) {
1937                 /* reduce scalar to 0 <= scalar < 2^521 */
1938                 if ((BN_num_bits(p_scalar) > 521)
1939                     || (BN_is_negative(p_scalar))) {
1940                     /*
1941                      * this is an unusual input, and we don't guarantee
1942                      * constant-timeness
1943                      */
1944                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1945                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1946                         goto err;
1947                     }
1948                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
1949                 } else
1950                     num_bytes = BN_bn2bin(p_scalar, tmp);
1951                 flip_endian(secrets[i], tmp, num_bytes);
1952                 /* precompute multiples */
1953                 if ((!BN_to_felem(x_out, p->X)) ||
1954                     (!BN_to_felem(y_out, p->Y)) ||
1955                     (!BN_to_felem(z_out, p->Z)))
1956                     goto err;
1957                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1958                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1959                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1960                 for (j = 2; j <= 16; ++j) {
1961                     if (j & 1) {
1962                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1963                                   pre_comp[i][j][2], pre_comp[i][1][0],
1964                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1965                                   pre_comp[i][j - 1][0],
1966                                   pre_comp[i][j - 1][1],
1967                                   pre_comp[i][j - 1][2]);
1968                     } else {
1969                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1970                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1971                                      pre_comp[i][j / 2][1],
1972                                      pre_comp[i][j / 2][2]);
1973                     }
1974                 }
1975             }
1976         }
1977         if (mixed)
1978             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1979     }
1980
1981     /* the scalar for the generator */
1982     if ((scalar != NULL) && (have_pre_comp)) {
1983         memset(g_secret, 0, sizeof(g_secret));
1984         /* reduce scalar to 0 <= scalar < 2^521 */
1985         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1986             /*
1987              * this is an unusual input, and we don't guarantee
1988              * constant-timeness
1989              */
1990             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1991                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1992                 goto err;
1993             }
1994             num_bytes = BN_bn2bin(tmp_scalar, tmp);
1995         } else
1996             num_bytes = BN_bn2bin(scalar, tmp);
1997         flip_endian(g_secret, tmp, num_bytes);
1998         /* do the multiplication with generator precomputation */
1999         batch_mul(x_out, y_out, z_out,
2000                   (const felem_bytearray(*))secrets, num_points,
2001                   g_secret,
2002                   mixed, (const felem(*)[17][3])pre_comp,
2003                   (const felem(*)[3])g_pre_comp);
2004     } else
2005         /* do the multiplication without generator precomputation */
2006         batch_mul(x_out, y_out, z_out,
2007                   (const felem_bytearray(*))secrets, num_points,
2008                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
2009     /* reduce the output to its unique minimal representation */
2010     felem_contract(x_in, x_out);
2011     felem_contract(y_in, y_out);
2012     felem_contract(z_in, z_out);
2013     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2014         (!felem_to_BN(z, z_in))) {
2015         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2016         goto err;
2017     }
2018     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2019
2020  err:
2021     BN_CTX_end(ctx);
2022     EC_POINT_free(generator);
2023     BN_CTX_free(new_ctx);
2024     if (secrets != NULL)
2025         OPENSSL_free(secrets);
2026     if (pre_comp != NULL)
2027         OPENSSL_free(pre_comp);
2028     if (tmp_felems != NULL)
2029         OPENSSL_free(tmp_felems);
2030     return ret;
2031 }
2032
2033 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2034 {
2035     int ret = 0;
2036     NISTP521_PRE_COMP *pre = NULL;
2037     int i, j;
2038     BN_CTX *new_ctx = NULL;
2039     BIGNUM *x, *y;
2040     EC_POINT *generator = NULL;
2041     felem tmp_felems[16];
2042
2043     /* throw away old precomputation */
2044     EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
2045                          nistp521_pre_comp_free,
2046                          nistp521_pre_comp_clear_free);
2047     if (ctx == NULL)
2048         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2049             return 0;
2050     BN_CTX_start(ctx);
2051     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2052         goto err;
2053     /* get the generator */
2054     if (group->generator == NULL)
2055         goto err;
2056     generator = EC_POINT_new(group);
2057     if (generator == NULL)
2058         goto err;
2059     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2060     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2061     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2062         goto err;
2063     if ((pre = nistp521_pre_comp_new()) == NULL)
2064         goto err;
2065     /*
2066      * if the generator is the standard one, use built-in precomputation
2067      */
2068     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2069         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2070         ret = 1;
2071         goto err;
2072     }
2073     if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
2074         (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
2075         (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2076         goto err;
2077     /* compute 2^130*G, 2^260*G, 2^390*G */
2078     for (i = 1; i <= 4; i <<= 1) {
2079         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2080                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2081                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2082         for (j = 0; j < 129; ++j) {
2083             point_double(pre->g_pre_comp[2 * i][0],
2084                          pre->g_pre_comp[2 * i][1],
2085                          pre->g_pre_comp[2 * i][2],
2086                          pre->g_pre_comp[2 * i][0],
2087                          pre->g_pre_comp[2 * i][1],
2088                          pre->g_pre_comp[2 * i][2]);
2089         }
2090     }
2091     /* g_pre_comp[0] is the point at infinity */
2092     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2093     /* the remaining multiples */
2094     /* 2^130*G + 2^260*G */
2095     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2096               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2097               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2098               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2099               pre->g_pre_comp[2][2]);
2100     /* 2^130*G + 2^390*G */
2101     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2102               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2103               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2104               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2105               pre->g_pre_comp[2][2]);
2106     /* 2^260*G + 2^390*G */
2107     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2108               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2109               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2110               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2111               pre->g_pre_comp[4][2]);
2112     /* 2^130*G + 2^260*G + 2^390*G */
2113     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2114               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2115               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2116               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2117               pre->g_pre_comp[2][2]);
2118     for (i = 1; i < 8; ++i) {
2119         /* odd multiples: add G */
2120         point_add(pre->g_pre_comp[2 * i + 1][0],
2121                   pre->g_pre_comp[2 * i + 1][1],
2122                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2123                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2124                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2125                   pre->g_pre_comp[1][2]);
2126     }
2127     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2128
2129     if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
2130                              nistp521_pre_comp_free,
2131                              nistp521_pre_comp_clear_free))
2132         goto err;
2133     ret = 1;
2134     pre = NULL;
2135  err:
2136     BN_CTX_end(ctx);
2137     EC_POINT_free(generator);
2138     BN_CTX_free(new_ctx);
2139     nistp521_pre_comp_free(pre);
2140     return ret;
2141 }
2142
2143 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2144 {
2145     if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2146                             nistp521_pre_comp_free,
2147                             nistp521_pre_comp_clear_free)
2148         != NULL)
2149         return 1;
2150     else
2151         return 0;
2152 }
2153
2154 #else
/*
 * Fallback for builds where OPENSSL_NO_EC_NISTP_64_GCC_128 is defined:
 * a self-referencing pointer, presumably present so that this translation
 * unit is not left empty.
 */
static void *dummy = (void *)&dummy;
2156 #endif