crypto/ec/ecp_nistp521.c

   1 /* crypto/ec/ecp_nistp521.c */
   2 /*
   3  * Written by Adam Langley (Google) for the OpenSSL project
   4  */
   5 /* Copyright 2011 Google Inc.
   6  *
   7  * Licensed under the Apache License, Version 2.0 (the "License");
   8  *
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  *  Unless required by applicable law or agreed to in writing, software
  15  *  distributed under the License is distributed on an "AS IS" BASIS,
  16  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  *  See the License for the specific language governing permissions and
  18  *  limitations under the License.
  19  */
  20
  21 /*
  22  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  23  *
  24  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  25  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  26  * work which got its smarts from Daniel J. Bernstein's work on the same.
  27  */
  28
  29 #include <openssl/opensslconf.h>
  30 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
  31
  32 # ifndef OPENSSL_SYS_VMS
  33 #  include <stdint.h>
  34 # else
  35 #  include <inttypes.h>
  36 # endif
  37
  38 # include <string.h>
  39 # include <openssl/err.h>
  40 # include "ec_lcl.h"
  41
  42 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  43   /* even with gcc, the typedef won't work for 32-bit platforms */
  44 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  45                                  * platforms */
  46 # else
  47 #  error "Need GCC 3.1 or later to define type uint128_t"
  48 # endif
  49
  50 typedef uint8_t u8;
  51 typedef uint64_t u64;
  52
  53 /*
  54  * The underlying field. P521 operates over GF(2^521-1). We can serialise an
  55  * element of this field into 66 bytes where the most significant byte
  56  * contains only a single bit. We call this an felem_bytearray.
  57  */
  58
  59 typedef u8 felem_bytearray[66];
  60
  61 /*
  62  * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
  63  * These values are big-endian.
  64  */
  65 static const felem_bytearray nistp521_curve_params[5] = {
  66     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
  67      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  68      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  69      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  70      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  72      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  73      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  74      0xff, 0xff},
  75     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
  76      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  77      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  78      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  79      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  80      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  81      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  82      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  83      0xff, 0xfc},
  84     {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
  85      0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
  86      0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
  87      0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
  88      0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
  89      0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
  90      0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
  91      0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
  92      0x3f, 0x00},
  93     {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
  94      0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
  95      0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
  96      0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
  97      0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
  98      0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
  99      0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
 100      0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
 101      0xbd, 0x66},
 102     {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
 103      0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
 104      0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
 105      0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
 106      0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
 107      0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
 108      0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
 109      0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
 110      0x66, 0x50}
 111 };
 112
 113 /*-
 114  * The representation of field elements.
 115  * ------------------------------------
 116  *
 117  * We represent field elements with nine values. These values are either 64 or
 118  * 128 bits and the field element represented is:
 119  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 120  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 121  * 58 bits apart, but are greater than 58 bits in length, the most significant
 122  * bits of each limb overlap with the least significant bits of the next.
 123  *
 124  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 125  * 'largefelem' */
 126
 127 # define NLIMBS 9
 128
 129 typedef uint64_t limb;
 130 typedef limb felem[NLIMBS];
 131 typedef uint128_t largefelem[NLIMBS];
 132
 133 static const limb bottom57bits = 0x1ffffffffffffff;
 134 static const limb bottom58bits = 0x3ffffffffffffff;
 135
 136 /*
 137  * bin66_to_felem takes a little-endian byte array and converts it into felem
 138  * form. This assumes that the CPU is little-endian.
 139  */
 140 static void bin66_to_felem(felem out, const u8 in[66])
 141 {
 142     out[0] = (*((limb *) & in[0])) & bottom58bits;
 143     out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
 144     out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
 145     out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
 146     out[4] = (*((limb *) & in[29])) & bottom58bits;
 147     out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
 148     out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
 149     out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
 150     out[8] = (*((limb *) & in[58])) & bottom57bits;
 151 }
 152
 153 /*
 154  * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
 155  * array. This assumes that the CPU is little-endian.
 156  */
 157 static void felem_to_bin66(u8 out[66], const felem in)
 158 {
 159     memset(out, 0, 66);
 160     (*((limb *) & out[0])) = in[0];
 161     (*((limb *) & out[7])) |= in[1] << 2;
 162     (*((limb *) & out[14])) |= in[2] << 4;
 163     (*((limb *) & out[21])) |= in[3] << 6;
 164     (*((limb *) & out[29])) = in[4];
 165     (*((limb *) & out[36])) |= in[5] << 2;
 166     (*((limb *) & out[43])) |= in[6] << 4;
 167     (*((limb *) & out[50])) |= in[7] << 6;
 168     (*((limb *) & out[58])) = in[8];
 169 }
 170
 171 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 172 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 173 {
 174     unsigned i;
 175     for (i = 0; i < len; ++i)
 176         out[i] = in[len - 1 - i];
 177 }
 178
 179 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 180 static int BN_to_felem(felem out, const BIGNUM *bn)
 181 {
 182     felem_bytearray b_in;
 183     felem_bytearray b_out;
 184     unsigned num_bytes;
 185
 186     /* BN_bn2bin eats leading zeroes */
 187     memset(b_out, 0, sizeof(b_out));
 188     num_bytes = BN_num_bytes(bn);
 189     if (num_bytes > sizeof(b_out)) {
 190         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 191         return 0;
 192     }
 193     if (BN_is_negative(bn)) {
 194         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 195         return 0;
 196     }
 197     num_bytes = BN_bn2bin(bn, b_in);
 198     flip_endian(b_out, b_in, num_bytes);
 199     bin66_to_felem(out, b_out);
 200     return 1;
 201 }
 202
 203 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 204 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 205 {
 206     felem_bytearray b_in, b_out;
 207     felem_to_bin66(b_in, in);
 208     flip_endian(b_out, b_in, sizeof(b_out));
 209     return BN_bin2bn(b_out, sizeof(b_out), out);
 210 }
 211
 212 /*-
 213  * Field operations
 214  * ----------------
 215  */
 216
 217 static void felem_one(felem out)
 218 {
 219     out[0] = 1;
 220     out[1] = 0;
 221     out[2] = 0;
 222     out[3] = 0;
 223     out[4] = 0;
 224     out[5] = 0;
 225     out[6] = 0;
 226     out[7] = 0;
 227     out[8] = 0;
 228 }
 229
 230 static void felem_assign(felem out, const felem in)
 231 {
 232     out[0] = in[0];
 233     out[1] = in[1];
 234     out[2] = in[2];
 235     out[3] = in[3];
 236     out[4] = in[4];
 237     out[5] = in[5];
 238     out[6] = in[6];
 239     out[7] = in[7];
 240     out[8] = in[8];
 241 }
 242
 243 /* felem_sum64 sets out = out + in. */
 244 static void felem_sum64(felem out, const felem in)
 245 {
 246     out[0] += in[0];
 247     out[1] += in[1];
 248     out[2] += in[2];
 249     out[3] += in[3];
 250     out[4] += in[4];
 251     out[5] += in[5];
 252     out[6] += in[6];
 253     out[7] += in[7];
 254     out[8] += in[8];
 255 }
 256
 257 /* felem_scalar sets out = in * scalar */
 258 static void felem_scalar(felem out, const felem in, limb scalar)
 259 {
 260     out[0] = in[0] * scalar;
 261     out[1] = in[1] * scalar;
 262     out[2] = in[2] * scalar;
 263     out[3] = in[3] * scalar;
 264     out[4] = in[4] * scalar;
 265     out[5] = in[5] * scalar;
 266     out[6] = in[6] * scalar;
 267     out[7] = in[7] * scalar;
 268     out[8] = in[8] * scalar;
 269 }
 270
 271 /* felem_scalar64 sets out = out * scalar */
 272 static void felem_scalar64(felem out, limb scalar)
 273 {
 274     out[0] *= scalar;
 275     out[1] *= scalar;
 276     out[2] *= scalar;
 277     out[3] *= scalar;
 278     out[4] *= scalar;
 279     out[5] *= scalar;
 280     out[6] *= scalar;
 281     out[7] *= scalar;
 282     out[8] *= scalar;
 283 }
 284
 285 /* felem_scalar128 sets out = out * scalar */
 286 static void felem_scalar128(largefelem out, limb scalar)
 287 {
 288     out[0] *= scalar;
 289     out[1] *= scalar;
 290     out[2] *= scalar;
 291     out[3] *= scalar;
 292     out[4] *= scalar;
 293     out[5] *= scalar;
 294     out[6] *= scalar;
 295     out[7] *= scalar;
 296     out[8] *= scalar;
 297 }
 298
 299 /*-
 300  * felem_neg sets |out| to |-in|
 301  * On entry:
 302  *   in[i] < 2^59 + 2^14
 303  * On exit:
 304  *   out[i] < 2^62
 305  */
 306 static void felem_neg(felem out, const felem in)
 307 {
 308     /* In order to prevent underflow, we subtract from 0 mod p. */
 309     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 310     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 311
 312     out[0] = two62m3 - in[0];
 313     out[1] = two62m2 - in[1];
 314     out[2] = two62m2 - in[2];
 315     out[3] = two62m2 - in[3];
 316     out[4] = two62m2 - in[4];
 317     out[5] = two62m2 - in[5];
 318     out[6] = two62m2 - in[6];
 319     out[7] = two62m2 - in[7];
 320     out[8] = two62m2 - in[8];
 321 }
 322
 323 /*-
 324  * felem_diff64 subtracts |in| from |out|
 325  * On entry:
 326  *   in[i] < 2^59 + 2^14
 327  * On exit:
 328  *   out[i] < out[i] + 2^62
 329  */
 330 static void felem_diff64(felem out, const felem in)
 331 {
 332     /*
 333      * In order to prevent underflow, we add 0 mod p before subtracting.
 334      */
 335     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 336     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 337
 338     out[0] += two62m3 - in[0];
 339     out[1] += two62m2 - in[1];
 340     out[2] += two62m2 - in[2];
 341     out[3] += two62m2 - in[3];
 342     out[4] += two62m2 - in[4];
 343     out[5] += two62m2 - in[5];
 344     out[6] += two62m2 - in[6];
 345     out[7] += two62m2 - in[7];
 346     out[8] += two62m2 - in[8];
 347 }
 348
 349 /*-
 350  * felem_diff_128_64 subtracts |in| from |out|
 351  * On entry:
 352  *   in[i] < 2^62 + 2^17
 353  * On exit:
 354  *   out[i] < out[i] + 2^63
 355  */
 356 static void felem_diff_128_64(largefelem out, const felem in)
 357 {
 358     /*
 359      * In order to prevent underflow, we add 64p mod p (which is equivalent
 360      * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
 361      * digit number with all bits set to 1. See "The representation of field
 362      * elements" comment above for a description of how limbs are used to
 363      * represent a number. 64p is represented with 8 limbs containing a number
 364      * with 58 bits set and one limb with a number with 57 bits set.
 365      */
 366     static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
 367     static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
 368
 369     out[0] += two63m6 - in[0];
 370     out[1] += two63m5 - in[1];
 371     out[2] += two63m5 - in[2];
 372     out[3] += two63m5 - in[3];
 373     out[4] += two63m5 - in[4];
 374     out[5] += two63m5 - in[5];
 375     out[6] += two63m5 - in[6];
 376     out[7] += two63m5 - in[7];
 377     out[8] += two63m5 - in[8];
 378 }
 379
 380 /*-
 381  * felem_diff_128_64 subtracts |in| from |out|
 382  * On entry:
 383  *   in[i] < 2^126
 384  * On exit:
 385  *   out[i] < out[i] + 2^127 - 2^69
 386  */
 387 static void felem_diff128(largefelem out, const largefelem in)
 388 {
 389     /*
 390      * In order to prevent underflow, we add 0 mod p before subtracting.
 391      */
 392     static const uint128_t two127m70 =
 393         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
 394     static const uint128_t two127m69 =
 395         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
 396
 397     out[0] += (two127m70 - in[0]);
 398     out[1] += (two127m69 - in[1]);
 399     out[2] += (two127m69 - in[2]);
 400     out[3] += (two127m69 - in[3]);
 401     out[4] += (two127m69 - in[4]);
 402     out[5] += (two127m69 - in[5]);
 403     out[6] += (two127m69 - in[6]);
 404     out[7] += (two127m69 - in[7]);
 405     out[8] += (two127m69 - in[8]);
 406 }
 407
 408 /*-
 409  * felem_square sets |out| = |in|^2
 410  * On entry:
 411  *   in[i] < 2^62
 412  * On exit:
 413  *   out[i] < 17 * max(in[i]) * max(in[i])
 414  */
 415 static void felem_square(largefelem out, const felem in)
 416 {
 417     felem inx2, inx4;
 418     felem_scalar(inx2, in, 2);
 419     felem_scalar(inx4, in, 4);
 420
 421     /*-
 422      * We have many cases were we want to do
 423      *   in[x] * in[y] +
 424      *   in[y] * in[x]
 425      * This is obviously just
 426      *   2 * in[x] * in[y]
 427      * However, rather than do the doubling on the 128 bit result, we
 428      * double one of the inputs to the multiplication by reading from
 429      * |inx2|
 430      */
 431
 432     out[0] = ((uint128_t) in[0]) * in[0];
 433     out[1] = ((uint128_t) in[0]) * inx2[1];
 434     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
 435     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
 436     out[4] = ((uint128_t) in[0]) * inx2[4] +
 437         ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
 438     out[5] = ((uint128_t) in[0]) * inx2[5] +
 439         ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
 440     out[6] = ((uint128_t) in[0]) * inx2[6] +
 441         ((uint128_t) in[1]) * inx2[5] +
 442         ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
 443     out[7] = ((uint128_t) in[0]) * inx2[7] +
 444         ((uint128_t) in[1]) * inx2[6] +
 445         ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
 446     out[8] = ((uint128_t) in[0]) * inx2[8] +
 447         ((uint128_t) in[1]) * inx2[7] +
 448         ((uint128_t) in[2]) * inx2[6] +
 449         ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
 450
 451     /*
 452      * The remaining limbs fall above 2^521, with the first falling at 2^522.
 453      * They correspond to locations one bit up from the limbs produced above
 454      * so we would have to multiply by two to align them. Again, rather than
 455      * operate on the 128-bit result, we double one of the inputs to the
 456      * multiplication. If we want to double for both this reason, and the
 457      * reason above, then we end up multiplying by four.
 458      */
 459
 460     /* 9 */
 461     out[0] += ((uint128_t) in[1]) * inx4[8] +
 462         ((uint128_t) in[2]) * inx4[7] +
 463         ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
 464
 465     /* 10 */
 466     out[1] += ((uint128_t) in[2]) * inx4[8] +
 467         ((uint128_t) in[3]) * inx4[7] +
 468         ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
 469
 470     /* 11 */
 471     out[2] += ((uint128_t) in[3]) * inx4[8] +
 472         ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
 473
 474     /* 12 */
 475     out[3] += ((uint128_t) in[4]) * inx4[8] +
 476         ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
 477
 478     /* 13 */
 479     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
 480
 481     /* 14 */
 482     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
 483
 484     /* 15 */
 485     out[6] += ((uint128_t) in[7]) * inx4[8];
 486
 487     /* 16 */
 488     out[7] += ((uint128_t) in[8]) * inx2[8];
 489 }
 490
 491 /*-
 492  * felem_mul sets |out| = |in1| * |in2|
 493  * On entry:
 494  *   in1[i] < 2^64
 495  *   in2[i] < 2^63
 496  * On exit:
 497  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 498  */
 499 static void felem_mul(largefelem out, const felem in1, const felem in2)
 500 {
 501     felem in2x2;
 502     felem_scalar(in2x2, in2, 2);
 503
 504     out[0] = ((uint128_t) in1[0]) * in2[0];
 505
 506     out[1] = ((uint128_t) in1[0]) * in2[1] + ((uint128_t) in1[1]) * in2[0];
 507
 508     out[2] = ((uint128_t) in1[0]) * in2[2] +
 509         ((uint128_t) in1[1]) * in2[1] + ((uint128_t) in1[2]) * in2[0];
 510
 511     out[3] = ((uint128_t) in1[0]) * in2[3] +
 512         ((uint128_t) in1[1]) * in2[2] +
 513         ((uint128_t) in1[2]) * in2[1] + ((uint128_t) in1[3]) * in2[0];
 514
 515     out[4] = ((uint128_t) in1[0]) * in2[4] +
 516         ((uint128_t) in1[1]) * in2[3] +
 517         ((uint128_t) in1[2]) * in2[2] +
 518         ((uint128_t) in1[3]) * in2[1] + ((uint128_t) in1[4]) * in2[0];
 519
 520     out[5] = ((uint128_t) in1[0]) * in2[5] +
 521         ((uint128_t) in1[1]) * in2[4] +
 522         ((uint128_t) in1[2]) * in2[3] +
 523         ((uint128_t) in1[3]) * in2[2] +
 524         ((uint128_t) in1[4]) * in2[1] + ((uint128_t) in1[5]) * in2[0];
 525
 526     out[6] = ((uint128_t) in1[0]) * in2[6] +
 527         ((uint128_t) in1[1]) * in2[5] +
 528         ((uint128_t) in1[2]) * in2[4] +
 529         ((uint128_t) in1[3]) * in2[3] +
 530         ((uint128_t) in1[4]) * in2[2] +
 531         ((uint128_t) in1[5]) * in2[1] + ((uint128_t) in1[6]) * in2[0];
 532
 533     out[7] = ((uint128_t) in1[0]) * in2[7] +
 534         ((uint128_t) in1[1]) * in2[6] +
 535         ((uint128_t) in1[2]) * in2[5] +
 536         ((uint128_t) in1[3]) * in2[4] +
 537         ((uint128_t) in1[4]) * in2[3] +
 538         ((uint128_t) in1[5]) * in2[2] +
 539         ((uint128_t) in1[6]) * in2[1] + ((uint128_t) in1[7]) * in2[0];
 540
 541     out[8] = ((uint128_t) in1[0]) * in2[8] +
 542         ((uint128_t) in1[1]) * in2[7] +
 543         ((uint128_t) in1[2]) * in2[6] +
 544         ((uint128_t) in1[3]) * in2[5] +
 545         ((uint128_t) in1[4]) * in2[4] +
 546         ((uint128_t) in1[5]) * in2[3] +
 547         ((uint128_t) in1[6]) * in2[2] +
 548         ((uint128_t) in1[7]) * in2[1] + ((uint128_t) in1[8]) * in2[0];
 549
 550     /* See comment in felem_square about the use of in2x2 here */
 551
 552     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 553         ((uint128_t) in1[2]) * in2x2[7] +
 554         ((uint128_t) in1[3]) * in2x2[6] +
 555         ((uint128_t) in1[4]) * in2x2[5] +
 556         ((uint128_t) in1[5]) * in2x2[4] +
 557         ((uint128_t) in1[6]) * in2x2[3] +
 558         ((uint128_t) in1[7]) * in2x2[2] + ((uint128_t) in1[8]) * in2x2[1];
 559
 560     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 561         ((uint128_t) in1[3]) * in2x2[7] +
 562         ((uint128_t) in1[4]) * in2x2[6] +
 563         ((uint128_t) in1[5]) * in2x2[5] +
 564         ((uint128_t) in1[6]) * in2x2[4] +
 565         ((uint128_t) in1[7]) * in2x2[3] + ((uint128_t) in1[8]) * in2x2[2];
 566
 567     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 568         ((uint128_t) in1[4]) * in2x2[7] +
 569         ((uint128_t) in1[5]) * in2x2[6] +
 570         ((uint128_t) in1[6]) * in2x2[5] +
 571         ((uint128_t) in1[7]) * in2x2[4] + ((uint128_t) in1[8]) * in2x2[3];
 572
 573     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 574         ((uint128_t) in1[5]) * in2x2[7] +
 575         ((uint128_t) in1[6]) * in2x2[6] +
 576         ((uint128_t) in1[7]) * in2x2[5] + ((uint128_t) in1[8]) * in2x2[4];
 577
 578     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 579         ((uint128_t) in1[6]) * in2x2[7] +
 580         ((uint128_t) in1[7]) * in2x2[6] + ((uint128_t) in1[8]) * in2x2[5];
 581
 582     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 583         ((uint128_t) in1[7]) * in2x2[7] + ((uint128_t) in1[8]) * in2x2[6];
 584
 585     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 586         ((uint128_t) in1[8]) * in2x2[7];
 587
 588     out[7] += ((uint128_t) in1[8]) * in2x2[8];
 589 }
 590
 591 static const limb bottom52bits = 0xfffffffffffff;
 592
 593 /*-
 594  * felem_reduce converts a largefelem to an felem.
 595  * On entry:
 596  *   in[i] < 2^128
 597  * On exit:
 598  *   out[i] < 2^59 + 2^14
 599  */
 600 static void felem_reduce(felem out, const largefelem in)
 601 {
 602     u64 overflow1, overflow2;
 603
 604     out[0] = ((limb) in[0]) & bottom58bits;
 605     out[1] = ((limb) in[1]) & bottom58bits;
 606     out[2] = ((limb) in[2]) & bottom58bits;
 607     out[3] = ((limb) in[3]) & bottom58bits;
 608     out[4] = ((limb) in[4]) & bottom58bits;
 609     out[5] = ((limb) in[5]) & bottom58bits;
 610     out[6] = ((limb) in[6]) & bottom58bits;
 611     out[7] = ((limb) in[7]) & bottom58bits;
 612     out[8] = ((limb) in[8]) & bottom58bits;
 613
 614     /* out[i] < 2^58 */
 615
 616     out[1] += ((limb) in[0]) >> 58;
 617     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 618     /*-
 619      * out[1] < 2^58 + 2^6 + 2^58
 620      *        = 2^59 + 2^6
 621      */
 622     out[2] += ((limb) (in[0] >> 64)) >> 52;
 623
 624     out[2] += ((limb) in[1]) >> 58;
 625     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 626     out[3] += ((limb) (in[1] >> 64)) >> 52;
 627
 628     out[3] += ((limb) in[2]) >> 58;
 629     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 630     out[4] += ((limb) (in[2] >> 64)) >> 52;
 631
 632     out[4] += ((limb) in[3]) >> 58;
 633     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 634     out[5] += ((limb) (in[3] >> 64)) >> 52;
 635
 636     out[5] += ((limb) in[4]) >> 58;
 637     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 638     out[6] += ((limb) (in[4] >> 64)) >> 52;
 639
 640     out[6] += ((limb) in[5]) >> 58;
 641     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 642     out[7] += ((limb) (in[5] >> 64)) >> 52;
 643
 644     out[7] += ((limb) in[6]) >> 58;
 645     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 646     out[8] += ((limb) (in[6] >> 64)) >> 52;
 647
 648     out[8] += ((limb) in[7]) >> 58;
 649     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 650     /*-
 651      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 652      *            < 2^59 + 2^13
 653      */
 654     overflow1 = ((limb) (in[7] >> 64)) >> 52;
 655
 656     overflow1 += ((limb) in[8]) >> 58;
 657     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 658     overflow2 = ((limb) (in[8] >> 64)) >> 52;
 659
 660     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
 661     overflow2 <<= 1;            /* overflow2 < 2^13 */
 662
 663     out[0] += overflow1;        /* out[0] < 2^60 */
 664     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
 665
 666     out[1] += out[0] >> 58;
 667     out[0] &= bottom58bits;
 668     /*-
 669      * out[0] < 2^58
 670      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 671      *        < 2^59 + 2^14
 672      */
 673 }
 674
 675 static void felem_square_reduce(felem out, const felem in)
 676 {
 677     largefelem tmp;
 678     felem_square(tmp, in);
 679     felem_reduce(out, tmp);
 680 }
 681
 682 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 683 {
 684     largefelem tmp;
 685     felem_mul(tmp, in1, in2);
 686     felem_reduce(out, tmp);
 687 }
 688
 689 /*-
 690  * felem_inv calculates |out| = |in|^{-1}
 691  *
 692  * Based on Fermat's Little Theorem:
 693  *   a^p = a (mod p)
 694  *   a^{p-1} = 1 (mod p)
 695  *   a^{p-2} = a^{-1} (mod p)
 696  */
 697 static void felem_inv(felem out, const felem in)
 698 {
 699     felem ftmp, ftmp2, ftmp3, ftmp4;
 700     largefelem tmp;
 701     unsigned i;
 702
 703     felem_square(tmp, in);
 704     felem_reduce(ftmp, tmp);    /* 2^1 */
 705     felem_mul(tmp, in, ftmp);
 706     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
 707     felem_assign(ftmp2, ftmp);
 708     felem_square(tmp, ftmp);
 709     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
 710     felem_mul(tmp, in, ftmp);
 711     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
 712     felem_square(tmp, ftmp);
 713     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
 714
 715     felem_square(tmp, ftmp2);
 716     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
 717     felem_square(tmp, ftmp3);
 718     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
 719     felem_mul(tmp, ftmp3, ftmp2);
 720     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
 721
 722     felem_assign(ftmp2, ftmp3);
 723     felem_square(tmp, ftmp3);
 724     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
 725     felem_square(tmp, ftmp3);
 726     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
 727     felem_square(tmp, ftmp3);
 728     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
 729     felem_square(tmp, ftmp3);
 730     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
 731     felem_assign(ftmp4, ftmp3);
 732     felem_mul(tmp, ftmp3, ftmp);
 733     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
 734     felem_square(tmp, ftmp4);
 735     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
 736     felem_mul(tmp, ftmp3, ftmp2);
 737     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
 738     felem_assign(ftmp2, ftmp3);
 739
 740     for (i = 0; i < 8; i++) {
 741         felem_square(tmp, ftmp3);
 742         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
 743     }
 744     felem_mul(tmp, ftmp3, ftmp2);
 745     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
 746     felem_assign(ftmp2, ftmp3);
 747
 748     for (i = 0; i < 16; i++) {
 749         felem_square(tmp, ftmp3);
 750         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
 751     }
 752     felem_mul(tmp, ftmp3, ftmp2);
 753     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
 754     felem_assign(ftmp2, ftmp3);
 755
 756     for (i = 0; i < 32; i++) {
 757         felem_square(tmp, ftmp3);
 758         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
 759     }
 760     felem_mul(tmp, ftmp3, ftmp2);
 761     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
 762     felem_assign(ftmp2, ftmp3);
 763
 764     for (i = 0; i < 64; i++) {
 765         felem_square(tmp, ftmp3);
 766         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
 767     }
 768     felem_mul(tmp, ftmp3, ftmp2);
 769     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
 770     felem_assign(ftmp2, ftmp3);
 771
 772     for (i = 0; i < 128; i++) {
 773         felem_square(tmp, ftmp3);
 774         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
 775     }
 776     felem_mul(tmp, ftmp3, ftmp2);
 777     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
 778     felem_assign(ftmp2, ftmp3);
 779
 780     for (i = 0; i < 256; i++) {
 781         felem_square(tmp, ftmp3);
 782         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
 783     }
 784     felem_mul(tmp, ftmp3, ftmp2);
 785     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
 786
 787     for (i = 0; i < 9; i++) {
 788         felem_square(tmp, ftmp3);
 789         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
 790     }
 791     felem_mul(tmp, ftmp3, ftmp4);
 792     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
 793     felem_mul(tmp, ftmp3, in);
 794     felem_reduce(out, tmp);     /* 2^512 - 3 */
 795 }
 796
 797 /* This is 2^521-1, expressed as an felem */
 798 static const felem kPrime = {
 799     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 800     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 801     0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
 802 };
 803
 804 /*-
 805  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 806  * otherwise.
 807  * On entry:
 808  *   in[i] < 2^59 + 2^14
 809  */
 810 static limb felem_is_zero(const felem in)
 811 {
 812     felem ftmp;
 813     limb is_zero, is_p;
 814     felem_assign(ftmp, in);
 815
 816     ftmp[0] += ftmp[8] >> 57;
 817     ftmp[8] &= bottom57bits;
 818     /* ftmp[8] < 2^57 */
 819     ftmp[1] += ftmp[0] >> 58;
 820     ftmp[0] &= bottom58bits;
 821     ftmp[2] += ftmp[1] >> 58;
 822     ftmp[1] &= bottom58bits;
 823     ftmp[3] += ftmp[2] >> 58;
 824     ftmp[2] &= bottom58bits;
 825     ftmp[4] += ftmp[3] >> 58;
 826     ftmp[3] &= bottom58bits;
 827     ftmp[5] += ftmp[4] >> 58;
 828     ftmp[4] &= bottom58bits;
 829     ftmp[6] += ftmp[5] >> 58;
 830     ftmp[5] &= bottom58bits;
 831     ftmp[7] += ftmp[6] >> 58;
 832     ftmp[6] &= bottom58bits;
 833     ftmp[8] += ftmp[7] >> 58;
 834     ftmp[7] &= bottom58bits;
 835     /* ftmp[8] < 2^57 + 4 */
 836
 837     /*
 838      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
 839      * than our bound for ftmp[8]. Therefore we only have to check if the
 840      * zero is zero or 2^521-1.
 841      */
 842
 843     is_zero = 0;
 844     is_zero |= ftmp[0];
 845     is_zero |= ftmp[1];
 846     is_zero |= ftmp[2];
 847     is_zero |= ftmp[3];
 848     is_zero |= ftmp[4];
 849     is_zero |= ftmp[5];
 850     is_zero |= ftmp[6];
 851     is_zero |= ftmp[7];
 852     is_zero |= ftmp[8];
 853
 854     is_zero--;
 855     /*
 856      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
 857      * can be set is if is_zero was 0 before the decrement.
 858      */
 859     is_zero = 0 - (is_zero >> 63);
 860
 861     is_p = ftmp[0] ^ kPrime[0];
 862     is_p |= ftmp[1] ^ kPrime[1];
 863     is_p |= ftmp[2] ^ kPrime[2];
 864     is_p |= ftmp[3] ^ kPrime[3];
 865     is_p |= ftmp[4] ^ kPrime[4];
 866     is_p |= ftmp[5] ^ kPrime[5];
 867     is_p |= ftmp[6] ^ kPrime[6];
 868     is_p |= ftmp[7] ^ kPrime[7];
 869     is_p |= ftmp[8] ^ kPrime[8];
 870
 871     is_p--;
 872     is_p = 0 - (is_p >> 63);
 873
 874     is_zero |= is_p;
 875     return is_zero;
 876 }
 877
 878 static int felem_is_zero_int(const void *in)
 879 {
 880     return (int)(felem_is_zero(in) & ((limb) 1));
 881 }
 882
 883 /*-
 884  * felem_contract converts |in| to its unique, minimal representation.
 885  * On entry:
 886  *   in[i] < 2^59 + 2^14
 887  */
 888 static void felem_contract(felem out, const felem in)
 889 {
 890     limb is_p, is_greater, sign;
 891     static const limb two58 = ((limb) 1) << 58;
 892
 893     felem_assign(out, in);
 894
 895     out[0] += out[8] >> 57;
 896     out[8] &= bottom57bits;
 897     /* out[8] < 2^57 */
 898     out[1] += out[0] >> 58;
 899     out[0] &= bottom58bits;
 900     out[2] += out[1] >> 58;
 901     out[1] &= bottom58bits;
 902     out[3] += out[2] >> 58;
 903     out[2] &= bottom58bits;
 904     out[4] += out[3] >> 58;
 905     out[3] &= bottom58bits;
 906     out[5] += out[4] >> 58;
 907     out[4] &= bottom58bits;
 908     out[6] += out[5] >> 58;
 909     out[5] &= bottom58bits;
 910     out[7] += out[6] >> 58;
 911     out[6] &= bottom58bits;
 912     out[8] += out[7] >> 58;
 913     out[7] &= bottom58bits;
 914     /* out[8] < 2^57 + 4 */
 915
 916     /*
 917      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
 918      * out. See the comments in felem_is_zero regarding why we don't test for
 919      * other multiples of the prime.
 920      */
 921
 922     /*
 923      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
 924      */
 925
 926     is_p = out[0] ^ kPrime[0];
 927     is_p |= out[1] ^ kPrime[1];
 928     is_p |= out[2] ^ kPrime[2];
 929     is_p |= out[3] ^ kPrime[3];
 930     is_p |= out[4] ^ kPrime[4];
 931     is_p |= out[5] ^ kPrime[5];
 932     is_p |= out[6] ^ kPrime[6];
 933     is_p |= out[7] ^ kPrime[7];
 934     is_p |= out[8] ^ kPrime[8];
 935
 936     is_p--;
 937     is_p &= is_p << 32;
 938     is_p &= is_p << 16;
 939     is_p &= is_p << 8;
 940     is_p &= is_p << 4;
 941     is_p &= is_p << 2;
 942     is_p &= is_p << 1;
 943     is_p = 0 - (is_p >> 63);
 944     is_p = ~is_p;
 945
 946     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
 947
 948     out[0] &= is_p;
 949     out[1] &= is_p;
 950     out[2] &= is_p;
 951     out[3] &= is_p;
 952     out[4] &= is_p;
 953     out[5] &= is_p;
 954     out[6] &= is_p;
 955     out[7] &= is_p;
 956     out[8] &= is_p;
 957
 958     /*
 959      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
 960      * 57 is greater than zero as (2^521-1) + x >= 2^522
 961      */
 962     is_greater = out[8] >> 57;
 963     is_greater |= is_greater << 32;
 964     is_greater |= is_greater << 16;
 965     is_greater |= is_greater << 8;
 966     is_greater |= is_greater << 4;
 967     is_greater |= is_greater << 2;
 968     is_greater |= is_greater << 1;
 969     is_greater = 0 - (is_greater >> 63);
 970
 971     out[0] -= kPrime[0] & is_greater;
 972     out[1] -= kPrime[1] & is_greater;
 973     out[2] -= kPrime[2] & is_greater;
 974     out[3] -= kPrime[3] & is_greater;
 975     out[4] -= kPrime[4] & is_greater;
 976     out[5] -= kPrime[5] & is_greater;
 977     out[6] -= kPrime[6] & is_greater;
 978     out[7] -= kPrime[7] & is_greater;
 979     out[8] -= kPrime[8] & is_greater;
 980
 981     /* Eliminate negative coefficients */
 982     sign = -(out[0] >> 63);
 983     out[0] += (two58 & sign);
 984     out[1] -= (1 & sign);
 985     sign = -(out[1] >> 63);
 986     out[1] += (two58 & sign);
 987     out[2] -= (1 & sign);
 988     sign = -(out[2] >> 63);
 989     out[2] += (two58 & sign);
 990     out[3] -= (1 & sign);
 991     sign = -(out[3] >> 63);
 992     out[3] += (two58 & sign);
 993     out[4] -= (1 & sign);
 994     sign = -(out[4] >> 63);
 995     out[4] += (two58 & sign);
 996     out[5] -= (1 & sign);
 997     sign = -(out[0] >> 63);
 998     out[5] += (two58 & sign);
 999     out[6] -= (1 & sign);
1000     sign = -(out[6] >> 63);
1001     out[6] += (two58 & sign);
1002     out[7] -= (1 & sign);
1003     sign = -(out[7] >> 63);
1004     out[7] += (two58 & sign);
1005     out[8] -= (1 & sign);
1006     sign = -(out[5] >> 63);
1007     out[5] += (two58 & sign);
1008     out[6] -= (1 & sign);
1009     sign = -(out[6] >> 63);
1010     out[6] += (two58 & sign);
1011     out[7] -= (1 & sign);
1012     sign = -(out[7] >> 63);
1013     out[7] += (two58 & sign);
1014     out[8] -= (1 & sign);
1015 }
1016
1017 /*-
1018  * Group operations
1019  * ----------------
1020  *
1021  * Building on top of the field operations we have the operations on the
1022  * elliptic curve group itself. Points on the curve are represented in Jacobian
1023  * coordinates */
1024
1025 /*-
1026  * point_double calcuates 2*(x_in, y_in, z_in)
1027  *
1028  * The method is taken from:
1029  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1030  *
1031  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1032  * while x_out == y_in is not (maybe this works, but it's not tested). */
1033 static void
1034 point_double(felem x_out, felem y_out, felem z_out,
1035              const felem x_in, const felem y_in, const felem z_in)
1036 {
1037     largefelem tmp, tmp2;
1038     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1039
1040     felem_assign(ftmp, x_in);
1041     felem_assign(ftmp2, x_in);
1042
1043     /* delta = z^2 */
1044     felem_square(tmp, z_in);
1045     felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1046
1047     /* gamma = y^2 */
1048     felem_square(tmp, y_in);
1049     felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1050
1051     /* beta = x*gamma */
1052     felem_mul(tmp, x_in, gamma);
1053     felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1054
1055     /* alpha = 3*(x-delta)*(x+delta) */
1056     felem_diff64(ftmp, delta);
1057     /* ftmp[i] < 2^61 */
1058     felem_sum64(ftmp2, delta);
1059     /* ftmp2[i] < 2^60 + 2^15 */
1060     felem_scalar64(ftmp2, 3);
1061     /* ftmp2[i] < 3*2^60 + 3*2^15 */
1062     felem_mul(tmp, ftmp, ftmp2);
1063     /*-
1064      * tmp[i] < 17(3*2^121 + 3*2^76)
1065      *        = 61*2^121 + 61*2^76
1066      *        < 64*2^121 + 64*2^76
1067      *        = 2^127 + 2^82
1068      *        < 2^128
1069      */
1070     felem_reduce(alpha, tmp);
1071
1072     /* x' = alpha^2 - 8*beta */
1073     felem_square(tmp, alpha);
1074     /*
1075      * tmp[i] < 17*2^120 < 2^125
1076      */
1077     felem_assign(ftmp, beta);
1078     felem_scalar64(ftmp, 8);
1079     /* ftmp[i] < 2^62 + 2^17 */
1080     felem_diff_128_64(tmp, ftmp);
1081     /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1082     felem_reduce(x_out, tmp);
1083
1084     /* z' = (y + z)^2 - gamma - delta */
1085     felem_sum64(delta, gamma);
1086     /* delta[i] < 2^60 + 2^15 */
1087     felem_assign(ftmp, y_in);
1088     felem_sum64(ftmp, z_in);
1089     /* ftmp[i] < 2^60 + 2^15 */
1090     felem_square(tmp, ftmp);
1091     /*
1092      * tmp[i] < 17(2^122) < 2^127
1093      */
1094     felem_diff_128_64(tmp, delta);
1095     /* tmp[i] < 2^127 + 2^63 */
1096     felem_reduce(z_out, tmp);
1097
1098     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1099     felem_scalar64(beta, 4);
1100     /* beta[i] < 2^61 + 2^16 */
1101     felem_diff64(beta, x_out);
1102     /* beta[i] < 2^61 + 2^60 + 2^16 */
1103     felem_mul(tmp, alpha, beta);
1104     /*-
1105      * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1106      *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1107      *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1108      *        < 2^128
1109      */
1110     felem_square(tmp2, gamma);
1111     /*-
1112      * tmp2[i] < 17*(2^59 + 2^14)^2
1113      *         = 17*(2^118 + 2^74 + 2^28)
1114      */
1115     felem_scalar128(tmp2, 8);
1116     /*-
1117      * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1118      *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1119      *         < 2^126
1120      */
1121     felem_diff128(tmp, tmp2);
1122     /*-
1123      * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1124      *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1125      *          2^74 + 2^69 + 2^34 + 2^30
1126      *        < 2^128
1127      */
1128     felem_reduce(y_out, tmp);
1129 }
1130
1131 /* copy_conditional copies in to out iff mask is all ones. */
1132 static void copy_conditional(felem out, const felem in, limb mask)
1133 {
1134     unsigned i;
1135     for (i = 0; i < NLIMBS; ++i) {
1136         const limb tmp = mask & (in[i] ^ out[i]);
1137         out[i] ^= tmp;
1138     }
1139 }
1140
1141 /*-
1142  * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1143  *
1144  * The method is taken from
1145  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1146  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1147  *
1148  * This function includes a branch for checking whether the two input points
1149  * are equal (while not equal to the point at infinity). This case never
1150  * happens during single point multiplication, so there is no timing leak for
1151  * ECDH or ECDSA signing. */
1152 static void point_add(felem x3, felem y3, felem z3,
1153                       const felem x1, const felem y1, const felem z1,
1154                       const int mixed, const felem x2, const felem y2,
1155                       const felem z2)
1156 {
1157     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1158     largefelem tmp, tmp2;
1159     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1160
1161     z1_is_zero = felem_is_zero(z1);
1162     z2_is_zero = felem_is_zero(z2);
1163
1164     /* ftmp = z1z1 = z1**2 */
1165     felem_square(tmp, z1);
1166     felem_reduce(ftmp, tmp);
1167
1168     if (!mixed) {
1169         /* ftmp2 = z2z2 = z2**2 */
1170         felem_square(tmp, z2);
1171         felem_reduce(ftmp2, tmp);
1172
1173         /* u1 = ftmp3 = x1*z2z2 */
1174         felem_mul(tmp, x1, ftmp2);
1175         felem_reduce(ftmp3, tmp);
1176
1177         /* ftmp5 = z1 + z2 */
1178         felem_assign(ftmp5, z1);
1179         felem_sum64(ftmp5, z2);
1180         /* ftmp5[i] < 2^61 */
1181
1182         /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1183         felem_square(tmp, ftmp5);
1184         /* tmp[i] < 17*2^122 */
1185         felem_diff_128_64(tmp, ftmp);
1186         /* tmp[i] < 17*2^122 + 2^63 */
1187         felem_diff_128_64(tmp, ftmp2);
1188         /* tmp[i] < 17*2^122 + 2^64 */
1189         felem_reduce(ftmp5, tmp);
1190
1191         /* ftmp2 = z2 * z2z2 */
1192         felem_mul(tmp, ftmp2, z2);
1193         felem_reduce(ftmp2, tmp);
1194
1195         /* s1 = ftmp6 = y1 * z2**3 */
1196         felem_mul(tmp, y1, ftmp2);
1197         felem_reduce(ftmp6, tmp);
1198     } else {
1199         /*
1200          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1201          */
1202
1203         /* u1 = ftmp3 = x1*z2z2 */
1204         felem_assign(ftmp3, x1);
1205
1206         /* ftmp5 = 2*z1z2 */
1207         felem_scalar(ftmp5, z1, 2);
1208
1209         /* s1 = ftmp6 = y1 * z2**3 */
1210         felem_assign(ftmp6, y1);
1211     }
1212
1213     /* u2 = x2*z1z1 */
1214     felem_mul(tmp, x2, ftmp);
1215     /* tmp[i] < 17*2^120 */
1216
1217     /* h = ftmp4 = u2 - u1 */
1218     felem_diff_128_64(tmp, ftmp3);
1219     /* tmp[i] < 17*2^120 + 2^63 */
1220     felem_reduce(ftmp4, tmp);
1221
1222     x_equal = felem_is_zero(ftmp4);
1223
1224     /* z_out = ftmp5 * h */
1225     felem_mul(tmp, ftmp5, ftmp4);
1226     felem_reduce(z_out, tmp);
1227
1228     /* ftmp = z1 * z1z1 */
1229     felem_mul(tmp, ftmp, z1);
1230     felem_reduce(ftmp, tmp);
1231
1232     /* s2 = tmp = y2 * z1**3 */
1233     felem_mul(tmp, y2, ftmp);
1234     /* tmp[i] < 17*2^120 */
1235
1236     /* r = ftmp5 = (s2 - s1)*2 */
1237     felem_diff_128_64(tmp, ftmp6);
1238     /* tmp[i] < 17*2^120 + 2^63 */
1239     felem_reduce(ftmp5, tmp);
1240     y_equal = felem_is_zero(ftmp5);
1241     felem_scalar64(ftmp5, 2);
1242     /* ftmp5[i] < 2^61 */
1243
1244     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1245         point_double(x3, y3, z3, x1, y1, z1);
1246         return;
1247     }
1248
1249     /* I = ftmp = (2h)**2 */
1250     felem_assign(ftmp, ftmp4);
1251     felem_scalar64(ftmp, 2);
1252     /* ftmp[i] < 2^61 */
1253     felem_square(tmp, ftmp);
1254     /* tmp[i] < 17*2^122 */
1255     felem_reduce(ftmp, tmp);
1256
1257     /* J = ftmp2 = h * I */
1258     felem_mul(tmp, ftmp4, ftmp);
1259     felem_reduce(ftmp2, tmp);
1260
1261     /* V = ftmp4 = U1 * I */
1262     felem_mul(tmp, ftmp3, ftmp);
1263     felem_reduce(ftmp4, tmp);
1264
1265     /* x_out = r**2 - J - 2V */
1266     felem_square(tmp, ftmp5);
1267     /* tmp[i] < 17*2^122 */
1268     felem_diff_128_64(tmp, ftmp2);
1269     /* tmp[i] < 17*2^122 + 2^63 */
1270     felem_assign(ftmp3, ftmp4);
1271     felem_scalar64(ftmp4, 2);
1272     /* ftmp4[i] < 2^61 */
1273     felem_diff_128_64(tmp, ftmp4);
1274     /* tmp[i] < 17*2^122 + 2^64 */
1275     felem_reduce(x_out, tmp);
1276
1277     /* y_out = r(V-x_out) - 2 * s1 * J */
1278     felem_diff64(ftmp3, x_out);
1279     /*
1280      * ftmp3[i] < 2^60 + 2^60 = 2^61
1281      */
1282     felem_mul(tmp, ftmp5, ftmp3);
1283     /* tmp[i] < 17*2^122 */
1284     felem_mul(tmp2, ftmp6, ftmp2);
1285     /* tmp2[i] < 17*2^120 */
1286     felem_scalar128(tmp2, 2);
1287     /* tmp2[i] < 17*2^121 */
1288     felem_diff128(tmp, tmp2);
1289         /*-
1290          * tmp[i] < 2^127 - 2^69 + 17*2^122
1291          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1292          *        < 2^127
1293          */
1294     felem_reduce(y_out, tmp);
1295
1296     copy_conditional(x_out, x2, z1_is_zero);
1297     copy_conditional(x_out, x1, z2_is_zero);
1298     copy_conditional(y_out, y2, z1_is_zero);
1299     copy_conditional(y_out, y1, z2_is_zero);
1300     copy_conditional(z_out, z2, z1_is_zero);
1301     copy_conditional(z_out, z1, z2_is_zero);
1302     felem_assign(x3, x_out);
1303     felem_assign(y3, y_out);
1304     felem_assign(z3, z_out);
1305 }
1306
1307 /*-
1308  * Base point pre computation
1309  * --------------------------
1310  *
1311  * Two different sorts of precomputed tables are used in the following code.
1312  * Each contain various points on the curve, where each point is three field
1313  * elements (x, y, z).
1314  *
1315  * For the base point table, z is usually 1 (0 for the point at infinity).
1316  * This table has 16 elements:
1317  * index | bits    | point
1318  * ------+---------+------------------------------
1319  *     0 | 0 0 0 0 | 0G
1320  *     1 | 0 0 0 1 | 1G
1321  *     2 | 0 0 1 0 | 2^130G
1322  *     3 | 0 0 1 1 | (2^130 + 1)G
1323  *     4 | 0 1 0 0 | 2^260G
1324  *     5 | 0 1 0 1 | (2^260 + 1)G
1325  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1326  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1327  *     8 | 1 0 0 0 | 2^390G
1328  *     9 | 1 0 0 1 | (2^390 + 1)G
1329  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1330  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1331  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1332  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1333  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1334  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1335  *
1336  * The reason for this is so that we can clock bits into four different
1337  * locations when doing simple scalar multiplies against the base point.
1338  *
1339  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1340
1341 /* gmul is the table of precomputed base points */
1342 static const felem gmul[16][3] = { {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1343                                     {0, 0, 0, 0, 0, 0, 0, 0, 0},
1344                                     {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1345 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1346   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1347   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1348  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1349   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1350   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1351  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1352 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1353   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1354   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1355  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1356   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1357   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1358  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1359 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1360   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1361   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1362  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1363   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1364   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1365  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1366 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1367   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1368   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1369  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1370   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1371   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1372  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1373 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1374   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1375   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1376  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1377   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1378   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1379  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1380 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1381   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1382   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1383  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1384   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1385   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1386  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1387 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1388   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1389   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1390  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1391   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1392   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1393  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1394 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1395   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1396   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1397  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1398   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1399   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1400  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1401 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1402   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1403   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1404  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1405   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1406   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1407  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1408 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1409   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1410   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1411  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1412   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1413   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1414  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1415 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1416   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1417   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1418  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1419   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1420   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1421  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1422 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1423   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1424   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1425  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1426   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1427   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1428  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1429 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1430   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1431   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1432  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1433   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1434   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1435  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1436 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1437   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1438   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1439  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1440   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1441   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1442  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1443 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1444   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1445   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1446  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1447   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1448   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1449  {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1450 };
1451
1452 /*
1453  * select_point selects the |idx|th point from a precomputation table and
1454  * copies it to out.
1455  */
1456  /* pre_comp below is of the size provided in |size| */
1457 static void select_point(const limb idx, unsigned int size,
1458                          const felem pre_comp[][3], felem out[3])
1459 {
1460     unsigned i, j;
1461     limb *outlimbs = &out[0][0];
1462     memset(outlimbs, 0, 3 * sizeof(felem));
1463
1464     for (i = 0; i < size; i++) {
1465         const limb *inlimbs = &pre_comp[i][0][0];
1466         limb mask = i ^ idx;
1467         mask |= mask >> 4;
1468         mask |= mask >> 2;
1469         mask |= mask >> 1;
1470         mask &= 1;
1471         mask--;
1472         for (j = 0; j < NLIMBS * 3; j++)
1473             outlimbs[j] |= inlimbs[j] & mask;
1474     }
1475 }
1476
1477 /* get_bit returns the |i|th bit in |in| */
1478 static char get_bit(const felem_bytearray in, int i)
1479 {
1480     if (i < 0)
1481         return 0;
1482     return (in[i >> 3] >> (i & 7)) & 1;
1483 }
1484
1485 /*
1486  * Interleaved point multiplication using precomputed point multiples: The
1487  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1488  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1489  * generator, using certain (large) precomputed multiples in g_pre_comp.
1490  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1491  */
1492 static void batch_mul(felem x_out, felem y_out, felem z_out,
1493                       const felem_bytearray scalars[],
1494                       const unsigned num_points, const u8 *g_scalar,
1495                       const int mixed, const felem pre_comp[][17][3],
1496                       const felem g_pre_comp[16][3])
1497 {
1498     int i, skip;
1499     unsigned num, gen_mul = (g_scalar != NULL);
1500     felem nq[3], tmp[4];
1501     limb bits;
1502     u8 sign, digit;
1503
1504     /* set nq to the point at infinity */
1505     memset(nq, 0, 3 * sizeof(felem));
1506
1507     /*
1508      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1509      * of the generator (last quarter of rounds) and additions of other
1510      * points multiples (every 5th round).
1511      */
1512     skip = 1;                   /* save two point operations in the first
1513                                  * round */
1514     for (i = (num_points ? 520 : 130); i >= 0; --i) {
1515         /* double */
1516         if (!skip)
1517             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1518
1519         /* add multiples of the generator */
1520         if (gen_mul && (i <= 130)) {
1521             bits = get_bit(g_scalar, i + 390) << 3;
1522             if (i < 130) {
1523                 bits |= get_bit(g_scalar, i + 260) << 2;
1524                 bits |= get_bit(g_scalar, i + 130) << 1;
1525                 bits |= get_bit(g_scalar, i);
1526             }
1527             /* select the point to add, in constant time */
1528             select_point(bits, 16, g_pre_comp, tmp);
1529             if (!skip) {
1530                 /* The 1 argument below is for "mixed" */
1531                 point_add(nq[0], nq[1], nq[2],
1532                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1533             } else {
1534                 memcpy(nq, tmp, 3 * sizeof(felem));
1535                 skip = 0;
1536             }
1537         }
1538
1539         /* do other additions every 5 doublings */
1540         if (num_points && (i % 5 == 0)) {
1541             /* loop over all scalars */
1542             for (num = 0; num < num_points; ++num) {
1543                 bits = get_bit(scalars[num], i + 4) << 5;
1544                 bits |= get_bit(scalars[num], i + 3) << 4;
1545                 bits |= get_bit(scalars[num], i + 2) << 3;
1546                 bits |= get_bit(scalars[num], i + 1) << 2;
1547                 bits |= get_bit(scalars[num], i) << 1;
1548                 bits |= get_bit(scalars[num], i - 1);
1549                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1550
1551                 /*
1552                  * select the point to add or subtract, in constant time
1553                  */
1554                 select_point(digit, 17, pre_comp[num], tmp);
1555                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1556                                             * point */
1557                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1558
1559                 if (!skip) {
1560                     point_add(nq[0], nq[1], nq[2],
1561                               nq[0], nq[1], nq[2],
1562                               mixed, tmp[0], tmp[1], tmp[2]);
1563                 } else {
1564                     memcpy(nq, tmp, 3 * sizeof(felem));
1565                     skip = 0;
1566                 }
1567             }
1568         }
1569     }
1570     felem_assign(x_out, nq[0]);
1571     felem_assign(y_out, nq[1]);
1572     felem_assign(z_out, nq[2]);
1573 }
1574
1575 /* Precomputation for the group generator. */
1576 typedef struct {
1577     felem g_pre_comp[16][3];
1578     int references;
1579 } NISTP521_PRE_COMP;
1580
1581 const EC_METHOD *EC_GFp_nistp521_method(void)
1582 {
1583     static const EC_METHOD ret = {
1584         EC_FLAGS_DEFAULT_OCT,
1585         NID_X9_62_prime_field,
1586         ec_GFp_nistp521_group_init,
1587         ec_GFp_simple_group_finish,
1588         ec_GFp_simple_group_clear_finish,
1589         ec_GFp_nist_group_copy,
1590         ec_GFp_nistp521_group_set_curve,
1591         ec_GFp_simple_group_get_curve,
1592         ec_GFp_simple_group_get_degree,
1593         ec_GFp_simple_group_check_discriminant,
1594         ec_GFp_simple_point_init,
1595         ec_GFp_simple_point_finish,
1596         ec_GFp_simple_point_clear_finish,
1597         ec_GFp_simple_point_copy,
1598         ec_GFp_simple_point_set_to_infinity,
1599         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1600         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1601         ec_GFp_simple_point_set_affine_coordinates,
1602         ec_GFp_nistp521_point_get_affine_coordinates,
1603         0 /* point_set_compressed_coordinates */ ,
1604         0 /* point2oct */ ,
1605         0 /* oct2point */ ,
1606         ec_GFp_simple_add,
1607         ec_GFp_simple_dbl,
1608         ec_GFp_simple_invert,
1609         ec_GFp_simple_is_at_infinity,
1610         ec_GFp_simple_is_on_curve,
1611         ec_GFp_simple_cmp,
1612         ec_GFp_simple_make_affine,
1613         ec_GFp_simple_points_make_affine,
1614         ec_GFp_nistp521_points_mul,
1615         ec_GFp_nistp521_precompute_mult,
1616         ec_GFp_nistp521_have_precompute_mult,
1617         ec_GFp_nist_field_mul,
1618         ec_GFp_nist_field_sqr,
1619         0 /* field_div */ ,
1620         0 /* field_encode */ ,
1621         0 /* field_decode */ ,
1622         0                       /* field_set_to_one */
1623     };
1624
1625     return &ret;
1626 }
1627
1628 /******************************************************************************/
1629 /*
1630  * FUNCTIONS TO MANAGE PRECOMPUTATION
1631  */
1632
1633 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1634 {
1635     NISTP521_PRE_COMP *ret = NULL;
1636     ret = (NISTP521_PRE_COMP *) OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1637     if (!ret) {
1638         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1639         return ret;
1640     }
1641     memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1642     ret->references = 1;
1643     return ret;
1644 }
1645
1646 static void *nistp521_pre_comp_dup(void *src_)
1647 {
1648     NISTP521_PRE_COMP *src = src_;
1649
1650     /* no need to actually copy, these objects never change! */
1651     CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1652
1653     return src_;
1654 }
1655
1656 static void nistp521_pre_comp_free(void *pre_)
1657 {
1658     int i;
1659     NISTP521_PRE_COMP *pre = pre_;
1660
1661     if (!pre)
1662         return;
1663
1664     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1665     if (i > 0)
1666         return;
1667
1668     OPENSSL_free(pre);
1669 }
1670
1671 static void nistp521_pre_comp_clear_free(void *pre_)
1672 {
1673     int i;
1674     NISTP521_PRE_COMP *pre = pre_;
1675
1676     if (!pre)
1677         return;
1678
1679     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1680     if (i > 0)
1681         return;
1682
1683     OPENSSL_cleanse(pre, sizeof(*pre));
1684     OPENSSL_free(pre);
1685 }
1686
1687 /******************************************************************************/
1688 /*
1689  * OPENSSL EC_METHOD FUNCTIONS
1690  */
1691
1692 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1693 {
1694     int ret;
1695     ret = ec_GFp_simple_group_init(group);
1696     group->a_is_minus3 = 1;
1697     return ret;
1698 }
1699
1700 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1701                                     const BIGNUM *a, const BIGNUM *b,
1702                                     BN_CTX *ctx)
1703 {
1704     int ret = 0;
1705     BN_CTX *new_ctx = NULL;
1706     BIGNUM *curve_p, *curve_a, *curve_b;
1707
1708     if (ctx == NULL)
1709         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1710             return 0;
1711     BN_CTX_start(ctx);
1712     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1713         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1714         ((curve_b = BN_CTX_get(ctx)) == NULL))
1715         goto err;
1716     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1717     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1718     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1719     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1720         ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1721               EC_R_WRONG_CURVE_PARAMETERS);
1722         goto err;
1723     }
1724     group->field_mod_func = BN_nist_mod_521;
1725     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1726  err:
1727     BN_CTX_end(ctx);
1728     if (new_ctx != NULL)
1729         BN_CTX_free(new_ctx);
1730     return ret;
1731 }
1732
1733 /*
1734  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1735  * (X/Z^2, Y/Z^3)
1736  */
1737 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1738                                                  const EC_POINT *point,
1739                                                  BIGNUM *x, BIGNUM *y,
1740                                                  BN_CTX *ctx)
1741 {
1742     felem z1, z2, x_in, y_in, x_out, y_out;
1743     largefelem tmp;
1744
1745     if (EC_POINT_is_at_infinity(group, point)) {
1746         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1747               EC_R_POINT_AT_INFINITY);
1748         return 0;
1749     }
1750     if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1751         (!BN_to_felem(z1, &point->Z)))
1752         return 0;
1753     felem_inv(z2, z1);
1754     felem_square(tmp, z2);
1755     felem_reduce(z1, tmp);
1756     felem_mul(tmp, x_in, z1);
1757     felem_reduce(x_in, tmp);
1758     felem_contract(x_out, x_in);
1759     if (x != NULL) {
1760         if (!felem_to_BN(x, x_out)) {
1761             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1762                   ERR_R_BN_LIB);
1763             return 0;
1764         }
1765     }
1766     felem_mul(tmp, z1, z2);
1767     felem_reduce(z1, tmp);
1768     felem_mul(tmp, y_in, z1);
1769     felem_reduce(y_in, tmp);
1770     felem_contract(y_out, y_in);
1771     if (y != NULL) {
1772         if (!felem_to_BN(y, y_out)) {
1773             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1774                   ERR_R_BN_LIB);
1775             return 0;
1776         }
1777     }
1778     return 1;
1779 }
1780
1781 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1782 static void make_points_affine(size_t num, felem points[][3],
1783                                felem tmp_felems[])
1784 {
1785     /*
1786      * Runs in constant time, unless an input is the point at infinity (which
1787      * normally shouldn't happen).
1788      */
1789     ec_GFp_nistp_points_make_affine_internal(num,
1790                                              points,
1791                                              sizeof(felem),
1792                                              tmp_felems,
1793                                              (void (*)(void *))felem_one,
1794                                              felem_is_zero_int,
1795                                              (void (*)(void *, const void *))
1796                                              felem_assign,
1797                                              (void (*)(void *, const void *))
1798                                              felem_square_reduce, (void (*)
1799                                                                    (void *,
1800                                                                     const void
1801                                                                     *,
1802                                                                     const void
1803                                                                     *))
1804                                              felem_mul_reduce,
1805                                              (void (*)(void *, const void *))
1806                                              felem_inv,
1807                                              (void (*)(void *, const void *))
1808                                              felem_contract);
1809 }
1810
1811 /*
1812  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1813  * values Result is stored in r (r can equal one of the inputs).
1814  */
1815 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1816                                const BIGNUM *scalar, size_t num,
1817                                const EC_POINT *points[],
1818                                const BIGNUM *scalars[], BN_CTX *ctx)
1819 {
1820     int ret = 0;
1821     int j;
1822     int mixed = 0;
1823     BN_CTX *new_ctx = NULL;
1824     BIGNUM *x, *y, *z, *tmp_scalar;
1825     felem_bytearray g_secret;
1826     felem_bytearray *secrets = NULL;
1827     felem(*pre_comp)[17][3] = NULL;
1828     felem *tmp_felems = NULL;
1829     felem_bytearray tmp;
1830     unsigned i, num_bytes;
1831     int have_pre_comp = 0;
1832     size_t num_points = num;
1833     felem x_in, y_in, z_in, x_out, y_out, z_out;
1834     NISTP521_PRE_COMP *pre = NULL;
1835     felem(*g_pre_comp)[3] = NULL;
1836     EC_POINT *generator = NULL;
1837     const EC_POINT *p = NULL;
1838     const BIGNUM *p_scalar = NULL;
1839
1840     if (ctx == NULL)
1841         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1842             return 0;
1843     BN_CTX_start(ctx);
1844     if (((x = BN_CTX_get(ctx)) == NULL) ||
1845         ((y = BN_CTX_get(ctx)) == NULL) ||
1846         ((z = BN_CTX_get(ctx)) == NULL) ||
1847         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1848         goto err;
1849
1850     if (scalar != NULL) {
1851         pre = EC_EX_DATA_get_data(group->extra_data,
1852                                   nistp521_pre_comp_dup,
1853                                   nistp521_pre_comp_free,
1854                                   nistp521_pre_comp_clear_free);
1855         if (pre)
1856             /* we have precomputation, try to use it */
1857             g_pre_comp = &pre->g_pre_comp[0];
1858         else
1859             /* try to use the standard precomputation */
1860             g_pre_comp = (felem(*)[3]) gmul;
1861         generator = EC_POINT_new(group);
1862         if (generator == NULL)
1863             goto err;
1864         /* get the generator from precomputation */
1865         if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1866             !felem_to_BN(y, g_pre_comp[1][1]) ||
1867             !felem_to_BN(z, g_pre_comp[1][2])) {
1868             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1869             goto err;
1870         }
1871         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1872                                                       generator, x, y, z,
1873                                                       ctx))
1874             goto err;
1875         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1876             /* precomputation matches generator */
1877             have_pre_comp = 1;
1878         else
1879             /*
1880              * we don't have valid precomputation: treat the generator as a
1881              * random point
1882              */
1883             num_points++;
1884     }
1885
1886     if (num_points > 0) {
1887         if (num_points >= 2) {
1888             /*
1889              * unless we precompute multiples for just one point, converting
1890              * those into affine form is time well spent
1891              */
1892             mixed = 1;
1893         }
1894         secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1895         pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1896         if (mixed)
1897             tmp_felems =
1898                 OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1899         if ((secrets == NULL) || (pre_comp == NULL)
1900             || (mixed && (tmp_felems == NULL))) {
1901             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1902             goto err;
1903         }
1904
1905         /*
1906          * we treat NULL scalars as 0, and NULL points as points at infinity,
1907          * i.e., they contribute nothing to the linear combination
1908          */
1909         memset(secrets, 0, num_points * sizeof(felem_bytearray));
1910         memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1911         for (i = 0; i < num_points; ++i) {
1912             if (i == num)
1913                 /*
1914                  * we didn't have a valid precomputation, so we pick the
1915                  * generator
1916                  */
1917             {
1918                 p = EC_GROUP_get0_generator(group);
1919                 p_scalar = scalar;
1920             } else
1921                 /* the i^th point */
1922             {
1923                 p = points[i];
1924                 p_scalar = scalars[i];
1925             }
1926             if ((p_scalar != NULL) && (p != NULL)) {
1927                 /* reduce scalar to 0 <= scalar < 2^521 */
1928                 if ((BN_num_bits(p_scalar) > 521)
1929                     || (BN_is_negative(p_scalar))) {
1930                     /*
1931                      * this is an unusual input, and we don't guarantee
1932                      * constant-timeness
1933                      */
1934                     if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
1935                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1936                         goto err;
1937                     }
1938                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
1939                 } else
1940                     num_bytes = BN_bn2bin(p_scalar, tmp);
1941                 flip_endian(secrets[i], tmp, num_bytes);
1942                 /* precompute multiples */
1943                 if ((!BN_to_felem(x_out, &p->X)) ||
1944                     (!BN_to_felem(y_out, &p->Y)) ||
1945                     (!BN_to_felem(z_out, &p->Z)))
1946                     goto err;
1947                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1948                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1949                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1950                 for (j = 2; j <= 16; ++j) {
1951                     if (j & 1) {
1952                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1953                                   pre_comp[i][j][2], pre_comp[i][1][0],
1954                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1955                                   pre_comp[i][j - 1][0],
1956                                   pre_comp[i][j - 1][1],
1957                                   pre_comp[i][j - 1][2]);
1958                     } else {
1959                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1960                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1961                                      pre_comp[i][j / 2][1],
1962                                      pre_comp[i][j / 2][2]);
1963                     }
1964                 }
1965             }
1966         }
1967         if (mixed)
1968             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1969     }
1970
1971     /* the scalar for the generator */
1972     if ((scalar != NULL) && (have_pre_comp)) {
1973         memset(g_secret, 0, sizeof(g_secret));
1974         /* reduce scalar to 0 <= scalar < 2^521 */
1975         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1976             /*
1977              * this is an unusual input, and we don't guarantee
1978              * constant-timeness
1979              */
1980             if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
1981                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1982                 goto err;
1983             }
1984             num_bytes = BN_bn2bin(tmp_scalar, tmp);
1985         } else
1986             num_bytes = BN_bn2bin(scalar, tmp);
1987         flip_endian(g_secret, tmp, num_bytes);
1988         /* do the multiplication with generator precomputation */
1989         batch_mul(x_out, y_out, z_out,
1990                   (const felem_bytearray(*))secrets, num_points,
1991                   g_secret,
1992                   mixed, (const felem(*)[17][3])pre_comp,
1993                   (const felem(*)[3])g_pre_comp);
1994     } else
1995         /* do the multiplication without generator precomputation */
1996         batch_mul(x_out, y_out, z_out,
1997                   (const felem_bytearray(*))secrets, num_points,
1998                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1999     /* reduce the output to its unique minimal representation */
2000     felem_contract(x_in, x_out);
2001     felem_contract(y_in, y_out);
2002     felem_contract(z_in, z_out);
2003     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2004         (!felem_to_BN(z, z_in))) {
2005         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2006         goto err;
2007     }
2008     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2009
2010  err:
2011     BN_CTX_end(ctx);
2012     if (generator != NULL)
2013         EC_POINT_free(generator);
2014     if (new_ctx != NULL)
2015         BN_CTX_free(new_ctx);
2016     if (secrets != NULL)
2017         OPENSSL_free(secrets);
2018     if (pre_comp != NULL)
2019         OPENSSL_free(pre_comp);
2020     if (tmp_felems != NULL)
2021         OPENSSL_free(tmp_felems);
2022     return ret;
2023 }
2024
2025 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2026 {
2027     int ret = 0;
2028     NISTP521_PRE_COMP *pre = NULL;
2029     int i, j;
2030     BN_CTX *new_ctx = NULL;
2031     BIGNUM *x, *y;
2032     EC_POINT *generator = NULL;
2033     felem tmp_felems[16];
2034
2035     /* throw away old precomputation */
2036     EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
2037                          nistp521_pre_comp_free,
2038                          nistp521_pre_comp_clear_free);
2039     if (ctx == NULL)
2040         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2041             return 0;
2042     BN_CTX_start(ctx);
2043     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2044         goto err;
2045     /* get the generator */
2046     if (group->generator == NULL)
2047         goto err;
2048     generator = EC_POINT_new(group);
2049     if (generator == NULL)
2050         goto err;
2051     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2052     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2053     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2054         goto err;
2055     if ((pre = nistp521_pre_comp_new()) == NULL)
2056         goto err;
2057     /*
2058      * if the generator is the standard one, use built-in precomputation
2059      */
2060     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2061         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2062         goto done;
2063     }
2064     if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
2065         (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
2066         (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
2067         goto err;
2068     /* compute 2^130*G, 2^260*G, 2^390*G */
2069     for (i = 1; i <= 4; i <<= 1) {
2070         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2071                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2072                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2073         for (j = 0; j < 129; ++j) {
2074             point_double(pre->g_pre_comp[2 * i][0],
2075                          pre->g_pre_comp[2 * i][1],
2076                          pre->g_pre_comp[2 * i][2],
2077                          pre->g_pre_comp[2 * i][0],
2078                          pre->g_pre_comp[2 * i][1],
2079                          pre->g_pre_comp[2 * i][2]);
2080         }
2081     }
2082     /* g_pre_comp[0] is the point at infinity */
2083     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2084     /* the remaining multiples */
2085     /* 2^130*G + 2^260*G */
2086     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2087               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2088               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2089               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2090               pre->g_pre_comp[2][2]);
2091     /* 2^130*G + 2^390*G */
2092     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2093               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2094               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2095               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2096               pre->g_pre_comp[2][2]);
2097     /* 2^260*G + 2^390*G */
2098     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2099               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2100               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2101               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2102               pre->g_pre_comp[4][2]);
2103     /* 2^130*G + 2^260*G + 2^390*G */
2104     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2105               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2106               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2107               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2108               pre->g_pre_comp[2][2]);
2109     for (i = 1; i < 8; ++i) {
2110         /* odd multiples: add G */
2111         point_add(pre->g_pre_comp[2 * i + 1][0],
2112                   pre->g_pre_comp[2 * i + 1][1],
2113                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2114                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2115                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2116                   pre->g_pre_comp[1][2]);
2117     }
2118     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2119
2120  done:
2121     if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
2122                              nistp521_pre_comp_free,
2123                              nistp521_pre_comp_clear_free))
2124         goto err;
2125     ret = 1;
2126     pre = NULL;
2127  err:
2128     BN_CTX_end(ctx);
2129     if (generator != NULL)
2130         EC_POINT_free(generator);
2131     if (new_ctx != NULL)
2132         BN_CTX_free(new_ctx);
2133     if (pre)
2134         nistp521_pre_comp_free(pre);
2135     return ret;
2136 }
2137
2138 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2139 {
2140     if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2141                             nistp521_pre_comp_free,
2142                             nistp521_pre_comp_clear_free)
2143         != NULL)
2144         return 1;
2145     else
2146         return 0;
2147 }
2148
2149 #else
2150 static void *dummy = &dummy;
2151 #endif