crypto/ec/ecp_nistp521.c

   1 /*
   2  * Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the Apache License 2.0 (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * ECDSA low level APIs are deprecated for public use, but still ok for
  28  * internal use.
  29  */
  30 #include "internal/deprecated.h"
  31
  32 /*
  33  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  34  *
  35  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  36  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  37  * work which got its smarts from Daniel J. Bernstein's work on the same.
  38  */
  39
  40 #include <openssl/e_os2.h>
  41
  42 #include <string.h>
  43 #include <openssl/err.h>
  44 #include "ec_local.h"
  45
  46 #if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
  47   /* even with gcc, the typedef won't work for 32-bit platforms */
  48 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  49                                  * platforms */
  50 #else
  51 # error "Your compiler doesn't appear to support 128-bit integer types"
  52 #endif
  53
  54 typedef uint8_t u8;
  55 typedef uint64_t u64;
  56
  57 /*
  58  * The underlying field. P521 operates over GF(2^521-1). We can serialise an
  59  * element of this field into 66 bytes where the most significant byte
  60  * contains only a single bit. We call this an felem_bytearray.
  61  */
  62
  63 typedef u8 felem_bytearray[66];
  64
  65 /*
  66  * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
  67  * These values are big-endian.
  68  */
  69 static const felem_bytearray nistp521_curve_params[5] = {
  70     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
  71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  72      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  73      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  74      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  75      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  76      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  77      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  78      0xff, 0xff},
  79     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
  80      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  81      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  82      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  83      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  84      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  85      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  86      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  87      0xff, 0xfc},
  88     {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
  89      0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
  90      0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
  91      0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
  92      0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
  93      0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
  94      0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
  95      0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
  96      0x3f, 0x00},
  97     {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
  98      0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
  99      0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
 100      0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
 101      0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
 102      0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
 103      0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
 104      0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
 105      0xbd, 0x66},
 106     {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
 107      0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
 108      0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
 109      0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
 110      0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
 111      0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
 112      0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
 113      0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
 114      0x66, 0x50}
 115 };
 116
 117 /*-
 118  * The representation of field elements.
 119  * ------------------------------------
 120  *
 121  * We represent field elements with nine values. These values are either 64 or
 122  * 128 bits and the field element represented is:
 123  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 124  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 125  * 58 bits apart, but are greater than 58 bits in length, the most significant
 126  * bits of each limb overlap with the least significant bits of the next.
 127  *
 128  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 129  * 'largefelem' */
 130
 131 #define NLIMBS 9
 132
 133 typedef uint64_t limb;
 134 typedef limb felem[NLIMBS];
 135 typedef uint128_t largefelem[NLIMBS];
 136
 137 static const limb bottom57bits = 0x1ffffffffffffff;
 138 static const limb bottom58bits = 0x3ffffffffffffff;
 139
 140 /*
 141  * bin66_to_felem takes a little-endian byte array and converts it into felem
 142  * form. This assumes that the CPU is little-endian.
 143  */
 144 static void bin66_to_felem(felem out, const u8 in[66])
 145 {
 146     out[0] = (*((limb *) & in[0])) & bottom58bits;
 147     out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
 148     out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
 149     out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
 150     out[4] = (*((limb *) & in[29])) & bottom58bits;
 151     out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
 152     out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
 153     out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
 154     out[8] = (*((limb *) & in[58])) & bottom57bits;
 155 }
 156
 157 /*
 158  * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
 159  * array. This assumes that the CPU is little-endian.
 160  */
 161 static void felem_to_bin66(u8 out[66], const felem in)
 162 {
 163     memset(out, 0, 66);
 164     (*((limb *) & out[0])) = in[0];
 165     (*((limb *) & out[7])) |= in[1] << 2;
 166     (*((limb *) & out[14])) |= in[2] << 4;
 167     (*((limb *) & out[21])) |= in[3] << 6;
 168     (*((limb *) & out[29])) = in[4];
 169     (*((limb *) & out[36])) |= in[5] << 2;
 170     (*((limb *) & out[43])) |= in[6] << 4;
 171     (*((limb *) & out[50])) |= in[7] << 6;
 172     (*((limb *) & out[58])) = in[8];
 173 }
 174
 175 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 176 static int BN_to_felem(felem out, const BIGNUM *bn)
 177 {
 178     felem_bytearray b_out;
 179     int num_bytes;
 180
 181     if (BN_is_negative(bn)) {
 182         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 183         return 0;
 184     }
 185     num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
 186     if (num_bytes < 0) {
 187         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 188         return 0;
 189     }
 190     bin66_to_felem(out, b_out);
 191     return 1;
 192 }
 193
 194 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 195 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 196 {
 197     felem_bytearray b_out;
 198     felem_to_bin66(b_out, in);
 199     return BN_lebin2bn(b_out, sizeof(b_out), out);
 200 }
 201
 202 /*-
 203  * Field operations
 204  * ----------------
 205  */
 206
 207 static void felem_one(felem out)
 208 {
 209     out[0] = 1;
 210     out[1] = 0;
 211     out[2] = 0;
 212     out[3] = 0;
 213     out[4] = 0;
 214     out[5] = 0;
 215     out[6] = 0;
 216     out[7] = 0;
 217     out[8] = 0;
 218 }
 219
 220 static void felem_assign(felem out, const felem in)
 221 {
 222     out[0] = in[0];
 223     out[1] = in[1];
 224     out[2] = in[2];
 225     out[3] = in[3];
 226     out[4] = in[4];
 227     out[5] = in[5];
 228     out[6] = in[6];
 229     out[7] = in[7];
 230     out[8] = in[8];
 231 }
 232
 233 /* felem_sum64 sets out = out + in. */
 234 static void felem_sum64(felem out, const felem in)
 235 {
 236     out[0] += in[0];
 237     out[1] += in[1];
 238     out[2] += in[2];
 239     out[3] += in[3];
 240     out[4] += in[4];
 241     out[5] += in[5];
 242     out[6] += in[6];
 243     out[7] += in[7];
 244     out[8] += in[8];
 245 }
 246
 247 /* felem_scalar sets out = in * scalar */
 248 static void felem_scalar(felem out, const felem in, limb scalar)
 249 {
 250     out[0] = in[0] * scalar;
 251     out[1] = in[1] * scalar;
 252     out[2] = in[2] * scalar;
 253     out[3] = in[3] * scalar;
 254     out[4] = in[4] * scalar;
 255     out[5] = in[5] * scalar;
 256     out[6] = in[6] * scalar;
 257     out[7] = in[7] * scalar;
 258     out[8] = in[8] * scalar;
 259 }
 260
 261 /* felem_scalar64 sets out = out * scalar */
 262 static void felem_scalar64(felem out, limb scalar)
 263 {
 264     out[0] *= scalar;
 265     out[1] *= scalar;
 266     out[2] *= scalar;
 267     out[3] *= scalar;
 268     out[4] *= scalar;
 269     out[5] *= scalar;
 270     out[6] *= scalar;
 271     out[7] *= scalar;
 272     out[8] *= scalar;
 273 }
 274
 275 /* felem_scalar128 sets out = out * scalar */
 276 static void felem_scalar128(largefelem out, limb scalar)
 277 {
 278     out[0] *= scalar;
 279     out[1] *= scalar;
 280     out[2] *= scalar;
 281     out[3] *= scalar;
 282     out[4] *= scalar;
 283     out[5] *= scalar;
 284     out[6] *= scalar;
 285     out[7] *= scalar;
 286     out[8] *= scalar;
 287 }
 288
 289 /*-
 290  * felem_neg sets |out| to |-in|
 291  * On entry:
 292  *   in[i] < 2^59 + 2^14
 293  * On exit:
 294  *   out[i] < 2^62
 295  */
 296 static void felem_neg(felem out, const felem in)
 297 {
 298     /* In order to prevent underflow, we subtract from 0 mod p. */
 299     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 300     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 301
 302     out[0] = two62m3 - in[0];
 303     out[1] = two62m2 - in[1];
 304     out[2] = two62m2 - in[2];
 305     out[3] = two62m2 - in[3];
 306     out[4] = two62m2 - in[4];
 307     out[5] = two62m2 - in[5];
 308     out[6] = two62m2 - in[6];
 309     out[7] = two62m2 - in[7];
 310     out[8] = two62m2 - in[8];
 311 }
 312
 313 /*-
 314  * felem_diff64 subtracts |in| from |out|
 315  * On entry:
 316  *   in[i] < 2^59 + 2^14
 317  * On exit:
 318  *   out[i] < out[i] + 2^62
 319  */
 320 static void felem_diff64(felem out, const felem in)
 321 {
 322     /*
 323      * In order to prevent underflow, we add 0 mod p before subtracting.
 324      */
 325     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 326     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 327
 328     out[0] += two62m3 - in[0];
 329     out[1] += two62m2 - in[1];
 330     out[2] += two62m2 - in[2];
 331     out[3] += two62m2 - in[3];
 332     out[4] += two62m2 - in[4];
 333     out[5] += two62m2 - in[5];
 334     out[6] += two62m2 - in[6];
 335     out[7] += two62m2 - in[7];
 336     out[8] += two62m2 - in[8];
 337 }
 338
 339 /*-
 340  * felem_diff_128_64 subtracts |in| from |out|
 341  * On entry:
 342  *   in[i] < 2^62 + 2^17
 343  * On exit:
 344  *   out[i] < out[i] + 2^63
 345  */
 346 static void felem_diff_128_64(largefelem out, const felem in)
 347 {
 348     /*
 349      * In order to prevent underflow, we add 64p mod p (which is equivalent
 350      * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
 351      * digit number with all bits set to 1. See "The representation of field
 352      * elements" comment above for a description of how limbs are used to
 353      * represent a number. 64p is represented with 8 limbs containing a number
 354      * with 58 bits set and one limb with a number with 57 bits set.
 355      */
 356     static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
 357     static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
 358
 359     out[0] += two63m6 - in[0];
 360     out[1] += two63m5 - in[1];
 361     out[2] += two63m5 - in[2];
 362     out[3] += two63m5 - in[3];
 363     out[4] += two63m5 - in[4];
 364     out[5] += two63m5 - in[5];
 365     out[6] += two63m5 - in[6];
 366     out[7] += two63m5 - in[7];
 367     out[8] += two63m5 - in[8];
 368 }
 369
 370 /*-
 371  * felem_diff_128_64 subtracts |in| from |out|
 372  * On entry:
 373  *   in[i] < 2^126
 374  * On exit:
 375  *   out[i] < out[i] + 2^127 - 2^69
 376  */
 377 static void felem_diff128(largefelem out, const largefelem in)
 378 {
 379     /*
 380      * In order to prevent underflow, we add 0 mod p before subtracting.
 381      */
 382     static const uint128_t two127m70 =
 383         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
 384     static const uint128_t two127m69 =
 385         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
 386
 387     out[0] += (two127m70 - in[0]);
 388     out[1] += (two127m69 - in[1]);
 389     out[2] += (two127m69 - in[2]);
 390     out[3] += (two127m69 - in[3]);
 391     out[4] += (two127m69 - in[4]);
 392     out[5] += (two127m69 - in[5]);
 393     out[6] += (two127m69 - in[6]);
 394     out[7] += (two127m69 - in[7]);
 395     out[8] += (two127m69 - in[8]);
 396 }
 397
 398 /*-
 399  * felem_square sets |out| = |in|^2
 400  * On entry:
 401  *   in[i] < 2^62
 402  * On exit:
 403  *   out[i] < 17 * max(in[i]) * max(in[i])
 404  */
 405 static void felem_square(largefelem out, const felem in)
 406 {
 407     felem inx2, inx4;
 408     felem_scalar(inx2, in, 2);
 409     felem_scalar(inx4, in, 4);
 410
 411     /*-
 412      * We have many cases were we want to do
 413      *   in[x] * in[y] +
 414      *   in[y] * in[x]
 415      * This is obviously just
 416      *   2 * in[x] * in[y]
 417      * However, rather than do the doubling on the 128 bit result, we
 418      * double one of the inputs to the multiplication by reading from
 419      * |inx2|
 420      */
 421
 422     out[0] = ((uint128_t) in[0]) * in[0];
 423     out[1] = ((uint128_t) in[0]) * inx2[1];
 424     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
 425     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
 426     out[4] = ((uint128_t) in[0]) * inx2[4] +
 427              ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
 428     out[5] = ((uint128_t) in[0]) * inx2[5] +
 429              ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
 430     out[6] = ((uint128_t) in[0]) * inx2[6] +
 431              ((uint128_t) in[1]) * inx2[5] +
 432              ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
 433     out[7] = ((uint128_t) in[0]) * inx2[7] +
 434              ((uint128_t) in[1]) * inx2[6] +
 435              ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
 436     out[8] = ((uint128_t) in[0]) * inx2[8] +
 437              ((uint128_t) in[1]) * inx2[7] +
 438              ((uint128_t) in[2]) * inx2[6] +
 439              ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
 440
 441     /*
 442      * The remaining limbs fall above 2^521, with the first falling at 2^522.
 443      * They correspond to locations one bit up from the limbs produced above
 444      * so we would have to multiply by two to align them. Again, rather than
 445      * operate on the 128-bit result, we double one of the inputs to the
 446      * multiplication. If we want to double for both this reason, and the
 447      * reason above, then we end up multiplying by four.
 448      */
 449
 450     /* 9 */
 451     out[0] += ((uint128_t) in[1]) * inx4[8] +
 452               ((uint128_t) in[2]) * inx4[7] +
 453               ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
 454
 455     /* 10 */
 456     out[1] += ((uint128_t) in[2]) * inx4[8] +
 457               ((uint128_t) in[3]) * inx4[7] +
 458               ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
 459
 460     /* 11 */
 461     out[2] += ((uint128_t) in[3]) * inx4[8] +
 462               ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
 463
 464     /* 12 */
 465     out[3] += ((uint128_t) in[4]) * inx4[8] +
 466               ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
 467
 468     /* 13 */
 469     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
 470
 471     /* 14 */
 472     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
 473
 474     /* 15 */
 475     out[6] += ((uint128_t) in[7]) * inx4[8];
 476
 477     /* 16 */
 478     out[7] += ((uint128_t) in[8]) * inx2[8];
 479 }
 480
 481 /*-
 482  * felem_mul sets |out| = |in1| * |in2|
 483  * On entry:
 484  *   in1[i] < 2^64
 485  *   in2[i] < 2^63
 486  * On exit:
 487  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 488  */
 489 static void felem_mul(largefelem out, const felem in1, const felem in2)
 490 {
 491     felem in2x2;
 492     felem_scalar(in2x2, in2, 2);
 493
 494     out[0] = ((uint128_t) in1[0]) * in2[0];
 495
 496     out[1] = ((uint128_t) in1[0]) * in2[1] +
 497              ((uint128_t) in1[1]) * in2[0];
 498
 499     out[2] = ((uint128_t) in1[0]) * in2[2] +
 500              ((uint128_t) in1[1]) * in2[1] +
 501              ((uint128_t) in1[2]) * in2[0];
 502
 503     out[3] = ((uint128_t) in1[0]) * in2[3] +
 504              ((uint128_t) in1[1]) * in2[2] +
 505              ((uint128_t) in1[2]) * in2[1] +
 506              ((uint128_t) in1[3]) * in2[0];
 507
 508     out[4] = ((uint128_t) in1[0]) * in2[4] +
 509              ((uint128_t) in1[1]) * in2[3] +
 510              ((uint128_t) in1[2]) * in2[2] +
 511              ((uint128_t) in1[3]) * in2[1] +
 512              ((uint128_t) in1[4]) * in2[0];
 513
 514     out[5] = ((uint128_t) in1[0]) * in2[5] +
 515              ((uint128_t) in1[1]) * in2[4] +
 516              ((uint128_t) in1[2]) * in2[3] +
 517              ((uint128_t) in1[3]) * in2[2] +
 518              ((uint128_t) in1[4]) * in2[1] +
 519              ((uint128_t) in1[5]) * in2[0];
 520
 521     out[6] = ((uint128_t) in1[0]) * in2[6] +
 522              ((uint128_t) in1[1]) * in2[5] +
 523              ((uint128_t) in1[2]) * in2[4] +
 524              ((uint128_t) in1[3]) * in2[3] +
 525              ((uint128_t) in1[4]) * in2[2] +
 526              ((uint128_t) in1[5]) * in2[1] +
 527              ((uint128_t) in1[6]) * in2[0];
 528
 529     out[7] = ((uint128_t) in1[0]) * in2[7] +
 530              ((uint128_t) in1[1]) * in2[6] +
 531              ((uint128_t) in1[2]) * in2[5] +
 532              ((uint128_t) in1[3]) * in2[4] +
 533              ((uint128_t) in1[4]) * in2[3] +
 534              ((uint128_t) in1[5]) * in2[2] +
 535              ((uint128_t) in1[6]) * in2[1] +
 536              ((uint128_t) in1[7]) * in2[0];
 537
 538     out[8] = ((uint128_t) in1[0]) * in2[8] +
 539              ((uint128_t) in1[1]) * in2[7] +
 540              ((uint128_t) in1[2]) * in2[6] +
 541              ((uint128_t) in1[3]) * in2[5] +
 542              ((uint128_t) in1[4]) * in2[4] +
 543              ((uint128_t) in1[5]) * in2[3] +
 544              ((uint128_t) in1[6]) * in2[2] +
 545              ((uint128_t) in1[7]) * in2[1] +
 546              ((uint128_t) in1[8]) * in2[0];
 547
 548     /* See comment in felem_square about the use of in2x2 here */
 549
 550     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 551               ((uint128_t) in1[2]) * in2x2[7] +
 552               ((uint128_t) in1[3]) * in2x2[6] +
 553               ((uint128_t) in1[4]) * in2x2[5] +
 554               ((uint128_t) in1[5]) * in2x2[4] +
 555               ((uint128_t) in1[6]) * in2x2[3] +
 556               ((uint128_t) in1[7]) * in2x2[2] +
 557               ((uint128_t) in1[8]) * in2x2[1];
 558
 559     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 560               ((uint128_t) in1[3]) * in2x2[7] +
 561               ((uint128_t) in1[4]) * in2x2[6] +
 562               ((uint128_t) in1[5]) * in2x2[5] +
 563               ((uint128_t) in1[6]) * in2x2[4] +
 564               ((uint128_t) in1[7]) * in2x2[3] +
 565               ((uint128_t) in1[8]) * in2x2[2];
 566
 567     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 568               ((uint128_t) in1[4]) * in2x2[7] +
 569               ((uint128_t) in1[5]) * in2x2[6] +
 570               ((uint128_t) in1[6]) * in2x2[5] +
 571               ((uint128_t) in1[7]) * in2x2[4] +
 572               ((uint128_t) in1[8]) * in2x2[3];
 573
 574     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 575               ((uint128_t) in1[5]) * in2x2[7] +
 576               ((uint128_t) in1[6]) * in2x2[6] +
 577               ((uint128_t) in1[7]) * in2x2[5] +
 578               ((uint128_t) in1[8]) * in2x2[4];
 579
 580     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 581               ((uint128_t) in1[6]) * in2x2[7] +
 582               ((uint128_t) in1[7]) * in2x2[6] +
 583               ((uint128_t) in1[8]) * in2x2[5];
 584
 585     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 586               ((uint128_t) in1[7]) * in2x2[7] +
 587               ((uint128_t) in1[8]) * in2x2[6];
 588
 589     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 590               ((uint128_t) in1[8]) * in2x2[7];
 591
 592     out[7] += ((uint128_t) in1[8]) * in2x2[8];
 593 }
 594
 595 static const limb bottom52bits = 0xfffffffffffff;
 596
 597 /*-
 598  * felem_reduce converts a largefelem to an felem.
 599  * On entry:
 600  *   in[i] < 2^128
 601  * On exit:
 602  *   out[i] < 2^59 + 2^14
 603  */
 604 static void felem_reduce(felem out, const largefelem in)
 605 {
 606     u64 overflow1, overflow2;
 607
 608     out[0] = ((limb) in[0]) & bottom58bits;
 609     out[1] = ((limb) in[1]) & bottom58bits;
 610     out[2] = ((limb) in[2]) & bottom58bits;
 611     out[3] = ((limb) in[3]) & bottom58bits;
 612     out[4] = ((limb) in[4]) & bottom58bits;
 613     out[5] = ((limb) in[5]) & bottom58bits;
 614     out[6] = ((limb) in[6]) & bottom58bits;
 615     out[7] = ((limb) in[7]) & bottom58bits;
 616     out[8] = ((limb) in[8]) & bottom58bits;
 617
 618     /* out[i] < 2^58 */
 619
 620     out[1] += ((limb) in[0]) >> 58;
 621     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 622     /*-
 623      * out[1] < 2^58 + 2^6 + 2^58
 624      *        = 2^59 + 2^6
 625      */
 626     out[2] += ((limb) (in[0] >> 64)) >> 52;
 627
 628     out[2] += ((limb) in[1]) >> 58;
 629     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 630     out[3] += ((limb) (in[1] >> 64)) >> 52;
 631
 632     out[3] += ((limb) in[2]) >> 58;
 633     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 634     out[4] += ((limb) (in[2] >> 64)) >> 52;
 635
 636     out[4] += ((limb) in[3]) >> 58;
 637     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 638     out[5] += ((limb) (in[3] >> 64)) >> 52;
 639
 640     out[5] += ((limb) in[4]) >> 58;
 641     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 642     out[6] += ((limb) (in[4] >> 64)) >> 52;
 643
 644     out[6] += ((limb) in[5]) >> 58;
 645     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 646     out[7] += ((limb) (in[5] >> 64)) >> 52;
 647
 648     out[7] += ((limb) in[6]) >> 58;
 649     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 650     out[8] += ((limb) (in[6] >> 64)) >> 52;
 651
 652     out[8] += ((limb) in[7]) >> 58;
 653     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 654     /*-
 655      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 656      *            < 2^59 + 2^13
 657      */
 658     overflow1 = ((limb) (in[7] >> 64)) >> 52;
 659
 660     overflow1 += ((limb) in[8]) >> 58;
 661     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 662     overflow2 = ((limb) (in[8] >> 64)) >> 52;
 663
 664     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
 665     overflow2 <<= 1;            /* overflow2 < 2^13 */
 666
 667     out[0] += overflow1;        /* out[0] < 2^60 */
 668     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
 669
 670     out[1] += out[0] >> 58;
 671     out[0] &= bottom58bits;
 672     /*-
 673      * out[0] < 2^58
 674      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 675      *        < 2^59 + 2^14
 676      */
 677 }
 678
 679 static void felem_square_reduce(felem out, const felem in)
 680 {
 681     largefelem tmp;
 682     felem_square(tmp, in);
 683     felem_reduce(out, tmp);
 684 }
 685
 686 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 687 {
 688     largefelem tmp;
 689     felem_mul(tmp, in1, in2);
 690     felem_reduce(out, tmp);
 691 }
 692
 693 /*-
 694  * felem_inv calculates |out| = |in|^{-1}
 695  *
 696  * Based on Fermat's Little Theorem:
 697  *   a^p = a (mod p)
 698  *   a^{p-1} = 1 (mod p)
 699  *   a^{p-2} = a^{-1} (mod p)
 700  */
 701 static void felem_inv(felem out, const felem in)
 702 {
 703     felem ftmp, ftmp2, ftmp3, ftmp4;
 704     largefelem tmp;
 705     unsigned i;
 706
 707     felem_square(tmp, in);
 708     felem_reduce(ftmp, tmp);    /* 2^1 */
 709     felem_mul(tmp, in, ftmp);
 710     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
 711     felem_assign(ftmp2, ftmp);
 712     felem_square(tmp, ftmp);
 713     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
 714     felem_mul(tmp, in, ftmp);
 715     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
 716     felem_square(tmp, ftmp);
 717     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
 718
 719     felem_square(tmp, ftmp2);
 720     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
 721     felem_square(tmp, ftmp3);
 722     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
 723     felem_mul(tmp, ftmp3, ftmp2);
 724     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
 725
 726     felem_assign(ftmp2, ftmp3);
 727     felem_square(tmp, ftmp3);
 728     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
 729     felem_square(tmp, ftmp3);
 730     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
 731     felem_square(tmp, ftmp3);
 732     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
 733     felem_square(tmp, ftmp3);
 734     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
 735     felem_assign(ftmp4, ftmp3);
 736     felem_mul(tmp, ftmp3, ftmp);
 737     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
 738     felem_square(tmp, ftmp4);
 739     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
 740     felem_mul(tmp, ftmp3, ftmp2);
 741     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
 742     felem_assign(ftmp2, ftmp3);
 743
 744     for (i = 0; i < 8; i++) {
 745         felem_square(tmp, ftmp3);
 746         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
 747     }
 748     felem_mul(tmp, ftmp3, ftmp2);
 749     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
 750     felem_assign(ftmp2, ftmp3);
 751
 752     for (i = 0; i < 16; i++) {
 753         felem_square(tmp, ftmp3);
 754         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
 755     }
 756     felem_mul(tmp, ftmp3, ftmp2);
 757     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
 758     felem_assign(ftmp2, ftmp3);
 759
 760     for (i = 0; i < 32; i++) {
 761         felem_square(tmp, ftmp3);
 762         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
 763     }
 764     felem_mul(tmp, ftmp3, ftmp2);
 765     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
 766     felem_assign(ftmp2, ftmp3);
 767
 768     for (i = 0; i < 64; i++) {
 769         felem_square(tmp, ftmp3);
 770         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
 771     }
 772     felem_mul(tmp, ftmp3, ftmp2);
 773     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
 774     felem_assign(ftmp2, ftmp3);
 775
 776     for (i = 0; i < 128; i++) {
 777         felem_square(tmp, ftmp3);
 778         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
 779     }
 780     felem_mul(tmp, ftmp3, ftmp2);
 781     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
 782     felem_assign(ftmp2, ftmp3);
 783
 784     for (i = 0; i < 256; i++) {
 785         felem_square(tmp, ftmp3);
 786         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
 787     }
 788     felem_mul(tmp, ftmp3, ftmp2);
 789     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
 790
 791     for (i = 0; i < 9; i++) {
 792         felem_square(tmp, ftmp3);
 793         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
 794     }
 795     felem_mul(tmp, ftmp3, ftmp4);
 796     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
 797     felem_mul(tmp, ftmp3, in);
 798     felem_reduce(out, tmp);     /* 2^512 - 3 */
 799 }
 800
 801 /* This is 2^521-1, expressed as an felem */
 802 static const felem kPrime = {
 803     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 804     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 805     0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
 806 };
 807
 808 /*-
 809  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 810  * otherwise.
 811  * On entry:
 812  *   in[i] < 2^59 + 2^14
 813  */
 814 static limb felem_is_zero(const felem in)
 815 {
 816     felem ftmp;
 817     limb is_zero, is_p;
 818     felem_assign(ftmp, in);
 819
 820     ftmp[0] += ftmp[8] >> 57;
 821     ftmp[8] &= bottom57bits;
 822     /* ftmp[8] < 2^57 */
 823     ftmp[1] += ftmp[0] >> 58;
 824     ftmp[0] &= bottom58bits;
 825     ftmp[2] += ftmp[1] >> 58;
 826     ftmp[1] &= bottom58bits;
 827     ftmp[3] += ftmp[2] >> 58;
 828     ftmp[2] &= bottom58bits;
 829     ftmp[4] += ftmp[3] >> 58;
 830     ftmp[3] &= bottom58bits;
 831     ftmp[5] += ftmp[4] >> 58;
 832     ftmp[4] &= bottom58bits;
 833     ftmp[6] += ftmp[5] >> 58;
 834     ftmp[5] &= bottom58bits;
 835     ftmp[7] += ftmp[6] >> 58;
 836     ftmp[6] &= bottom58bits;
 837     ftmp[8] += ftmp[7] >> 58;
 838     ftmp[7] &= bottom58bits;
 839     /* ftmp[8] < 2^57 + 4 */
 840
 841     /*
 842      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
 843      * than our bound for ftmp[8]. Therefore we only have to check if the
 844      * zero is zero or 2^521-1.
 845      */
 846
 847     is_zero = 0;
 848     is_zero |= ftmp[0];
 849     is_zero |= ftmp[1];
 850     is_zero |= ftmp[2];
 851     is_zero |= ftmp[3];
 852     is_zero |= ftmp[4];
 853     is_zero |= ftmp[5];
 854     is_zero |= ftmp[6];
 855     is_zero |= ftmp[7];
 856     is_zero |= ftmp[8];
 857
 858     is_zero--;
 859     /*
 860      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
 861      * can be set is if is_zero was 0 before the decrement.
 862      */
 863     is_zero = 0 - (is_zero >> 63);
 864
 865     is_p = ftmp[0] ^ kPrime[0];
 866     is_p |= ftmp[1] ^ kPrime[1];
 867     is_p |= ftmp[2] ^ kPrime[2];
 868     is_p |= ftmp[3] ^ kPrime[3];
 869     is_p |= ftmp[4] ^ kPrime[4];
 870     is_p |= ftmp[5] ^ kPrime[5];
 871     is_p |= ftmp[6] ^ kPrime[6];
 872     is_p |= ftmp[7] ^ kPrime[7];
 873     is_p |= ftmp[8] ^ kPrime[8];
 874
 875     is_p--;
 876     is_p = 0 - (is_p >> 63);
 877
 878     is_zero |= is_p;
 879     return is_zero;
 880 }
 881
 882 static int felem_is_zero_int(const void *in)
 883 {
 884     return (int)(felem_is_zero(in) & ((limb) 1));
 885 }
 886
 887 /*-
 888  * felem_contract converts |in| to its unique, minimal representation.
 889  * On entry:
 890  *   in[i] < 2^59 + 2^14
 891  */
 892 static void felem_contract(felem out, const felem in)
 893 {
 894     limb is_p, is_greater, sign;
 895     static const limb two58 = ((limb) 1) << 58;
 896
 897     felem_assign(out, in);
 898
 899     out[0] += out[8] >> 57;
 900     out[8] &= bottom57bits;
 901     /* out[8] < 2^57 */
 902     out[1] += out[0] >> 58;
 903     out[0] &= bottom58bits;
 904     out[2] += out[1] >> 58;
 905     out[1] &= bottom58bits;
 906     out[3] += out[2] >> 58;
 907     out[2] &= bottom58bits;
 908     out[4] += out[3] >> 58;
 909     out[3] &= bottom58bits;
 910     out[5] += out[4] >> 58;
 911     out[4] &= bottom58bits;
 912     out[6] += out[5] >> 58;
 913     out[5] &= bottom58bits;
 914     out[7] += out[6] >> 58;
 915     out[6] &= bottom58bits;
 916     out[8] += out[7] >> 58;
 917     out[7] &= bottom58bits;
 918     /* out[8] < 2^57 + 4 */
 919
 920     /*
 921      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
 922      * out. See the comments in felem_is_zero regarding why we don't test for
 923      * other multiples of the prime.
 924      */
 925
 926     /*
 927      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
 928      */
 929
 930     is_p = out[0] ^ kPrime[0];
 931     is_p |= out[1] ^ kPrime[1];
 932     is_p |= out[2] ^ kPrime[2];
 933     is_p |= out[3] ^ kPrime[3];
 934     is_p |= out[4] ^ kPrime[4];
 935     is_p |= out[5] ^ kPrime[5];
 936     is_p |= out[6] ^ kPrime[6];
 937     is_p |= out[7] ^ kPrime[7];
 938     is_p |= out[8] ^ kPrime[8];
 939
 940     is_p--;
 941     is_p &= is_p << 32;
 942     is_p &= is_p << 16;
 943     is_p &= is_p << 8;
 944     is_p &= is_p << 4;
 945     is_p &= is_p << 2;
 946     is_p &= is_p << 1;
 947     is_p = 0 - (is_p >> 63);
 948     is_p = ~is_p;
 949
 950     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
 951
 952     out[0] &= is_p;
 953     out[1] &= is_p;
 954     out[2] &= is_p;
 955     out[3] &= is_p;
 956     out[4] &= is_p;
 957     out[5] &= is_p;
 958     out[6] &= is_p;
 959     out[7] &= is_p;
 960     out[8] &= is_p;
 961
 962     /*
 963      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
 964      * 57 is greater than zero as (2^521-1) + x >= 2^522
 965      */
 966     is_greater = out[8] >> 57;
 967     is_greater |= is_greater << 32;
 968     is_greater |= is_greater << 16;
 969     is_greater |= is_greater << 8;
 970     is_greater |= is_greater << 4;
 971     is_greater |= is_greater << 2;
 972     is_greater |= is_greater << 1;
 973     is_greater = 0 - (is_greater >> 63);
 974
 975     out[0] -= kPrime[0] & is_greater;
 976     out[1] -= kPrime[1] & is_greater;
 977     out[2] -= kPrime[2] & is_greater;
 978     out[3] -= kPrime[3] & is_greater;
 979     out[4] -= kPrime[4] & is_greater;
 980     out[5] -= kPrime[5] & is_greater;
 981     out[6] -= kPrime[6] & is_greater;
 982     out[7] -= kPrime[7] & is_greater;
 983     out[8] -= kPrime[8] & is_greater;
 984
 985     /* Eliminate negative coefficients */
 986     sign = -(out[0] >> 63);
 987     out[0] += (two58 & sign);
 988     out[1] -= (1 & sign);
 989     sign = -(out[1] >> 63);
 990     out[1] += (two58 & sign);
 991     out[2] -= (1 & sign);
 992     sign = -(out[2] >> 63);
 993     out[2] += (two58 & sign);
 994     out[3] -= (1 & sign);
 995     sign = -(out[3] >> 63);
 996     out[3] += (two58 & sign);
 997     out[4] -= (1 & sign);
 998     sign = -(out[4] >> 63);
 999     out[4] += (two58 & sign);
1000     out[5] -= (1 & sign);
1001     sign = -(out[0] >> 63);
1002     out[5] += (two58 & sign);
1003     out[6] -= (1 & sign);
1004     sign = -(out[6] >> 63);
1005     out[6] += (two58 & sign);
1006     out[7] -= (1 & sign);
1007     sign = -(out[7] >> 63);
1008     out[7] += (two58 & sign);
1009     out[8] -= (1 & sign);
1010     sign = -(out[5] >> 63);
1011     out[5] += (two58 & sign);
1012     out[6] -= (1 & sign);
1013     sign = -(out[6] >> 63);
1014     out[6] += (two58 & sign);
1015     out[7] -= (1 & sign);
1016     sign = -(out[7] >> 63);
1017     out[7] += (two58 & sign);
1018     out[8] -= (1 & sign);
1019 }
1020
1021 /*-
1022  * Group operations
1023  * ----------------
1024  *
1025  * Building on top of the field operations we have the operations on the
1026  * elliptic curve group itself. Points on the curve are represented in Jacobian
1027  * coordinates */
1028
1029 /*-
1030  * point_double calculates 2*(x_in, y_in, z_in)
1031  *
1032  * The method is taken from:
1033  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1034  *
1035  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1036  * while x_out == y_in is not (maybe this works, but it's not tested). */
1037 static void
1038 point_double(felem x_out, felem y_out, felem z_out,
1039              const felem x_in, const felem y_in, const felem z_in)
1040 {
1041     largefelem tmp, tmp2;
1042     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1043
1044     felem_assign(ftmp, x_in);
1045     felem_assign(ftmp2, x_in);
1046
1047     /* delta = z^2 */
1048     felem_square(tmp, z_in);
1049     felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1050
1051     /* gamma = y^2 */
1052     felem_square(tmp, y_in);
1053     felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1054
1055     /* beta = x*gamma */
1056     felem_mul(tmp, x_in, gamma);
1057     felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1058
1059     /* alpha = 3*(x-delta)*(x+delta) */
1060     felem_diff64(ftmp, delta);
1061     /* ftmp[i] < 2^61 */
1062     felem_sum64(ftmp2, delta);
1063     /* ftmp2[i] < 2^60 + 2^15 */
1064     felem_scalar64(ftmp2, 3);
1065     /* ftmp2[i] < 3*2^60 + 3*2^15 */
1066     felem_mul(tmp, ftmp, ftmp2);
1067     /*-
1068      * tmp[i] < 17(3*2^121 + 3*2^76)
1069      *        = 61*2^121 + 61*2^76
1070      *        < 64*2^121 + 64*2^76
1071      *        = 2^127 + 2^82
1072      *        < 2^128
1073      */
1074     felem_reduce(alpha, tmp);
1075
1076     /* x' = alpha^2 - 8*beta */
1077     felem_square(tmp, alpha);
1078     /*
1079      * tmp[i] < 17*2^120 < 2^125
1080      */
1081     felem_assign(ftmp, beta);
1082     felem_scalar64(ftmp, 8);
1083     /* ftmp[i] < 2^62 + 2^17 */
1084     felem_diff_128_64(tmp, ftmp);
1085     /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1086     felem_reduce(x_out, tmp);
1087
1088     /* z' = (y + z)^2 - gamma - delta */
1089     felem_sum64(delta, gamma);
1090     /* delta[i] < 2^60 + 2^15 */
1091     felem_assign(ftmp, y_in);
1092     felem_sum64(ftmp, z_in);
1093     /* ftmp[i] < 2^60 + 2^15 */
1094     felem_square(tmp, ftmp);
1095     /*
1096      * tmp[i] < 17(2^122) < 2^127
1097      */
1098     felem_diff_128_64(tmp, delta);
1099     /* tmp[i] < 2^127 + 2^63 */
1100     felem_reduce(z_out, tmp);
1101
1102     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1103     felem_scalar64(beta, 4);
1104     /* beta[i] < 2^61 + 2^16 */
1105     felem_diff64(beta, x_out);
1106     /* beta[i] < 2^61 + 2^60 + 2^16 */
1107     felem_mul(tmp, alpha, beta);
1108     /*-
1109      * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1110      *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1111      *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1112      *        < 2^128
1113      */
1114     felem_square(tmp2, gamma);
1115     /*-
1116      * tmp2[i] < 17*(2^59 + 2^14)^2
1117      *         = 17*(2^118 + 2^74 + 2^28)
1118      */
1119     felem_scalar128(tmp2, 8);
1120     /*-
1121      * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1122      *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1123      *         < 2^126
1124      */
1125     felem_diff128(tmp, tmp2);
1126     /*-
1127      * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1128      *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1129      *          2^74 + 2^69 + 2^34 + 2^30
1130      *        < 2^128
1131      */
1132     felem_reduce(y_out, tmp);
1133 }
1134
1135 /* copy_conditional copies in to out iff mask is all ones. */
1136 static void copy_conditional(felem out, const felem in, limb mask)
1137 {
1138     unsigned i;
1139     for (i = 0; i < NLIMBS; ++i) {
1140         const limb tmp = mask & (in[i] ^ out[i]);
1141         out[i] ^= tmp;
1142     }
1143 }
1144
1145 /*-
1146  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1147  *
1148  * The method is taken from
1149  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1150  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1151  *
1152  * This function includes a branch for checking whether the two input points
1153  * are equal (while not equal to the point at infinity). See comment below
1154  * on constant-time.
1155  */
1156 static void point_add(felem x3, felem y3, felem z3,
1157                       const felem x1, const felem y1, const felem z1,
1158                       const int mixed, const felem x2, const felem y2,
1159                       const felem z2)
1160 {
1161     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1162     largefelem tmp, tmp2;
1163     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1164     limb points_equal;
1165
1166     z1_is_zero = felem_is_zero(z1);
1167     z2_is_zero = felem_is_zero(z2);
1168
1169     /* ftmp = z1z1 = z1**2 */
1170     felem_square(tmp, z1);
1171     felem_reduce(ftmp, tmp);
1172
1173     if (!mixed) {
1174         /* ftmp2 = z2z2 = z2**2 */
1175         felem_square(tmp, z2);
1176         felem_reduce(ftmp2, tmp);
1177
1178         /* u1 = ftmp3 = x1*z2z2 */
1179         felem_mul(tmp, x1, ftmp2);
1180         felem_reduce(ftmp3, tmp);
1181
1182         /* ftmp5 = z1 + z2 */
1183         felem_assign(ftmp5, z1);
1184         felem_sum64(ftmp5, z2);
1185         /* ftmp5[i] < 2^61 */
1186
1187         /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1188         felem_square(tmp, ftmp5);
1189         /* tmp[i] < 17*2^122 */
1190         felem_diff_128_64(tmp, ftmp);
1191         /* tmp[i] < 17*2^122 + 2^63 */
1192         felem_diff_128_64(tmp, ftmp2);
1193         /* tmp[i] < 17*2^122 + 2^64 */
1194         felem_reduce(ftmp5, tmp);
1195
1196         /* ftmp2 = z2 * z2z2 */
1197         felem_mul(tmp, ftmp2, z2);
1198         felem_reduce(ftmp2, tmp);
1199
1200         /* s1 = ftmp6 = y1 * z2**3 */
1201         felem_mul(tmp, y1, ftmp2);
1202         felem_reduce(ftmp6, tmp);
1203     } else {
1204         /*
1205          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1206          */
1207
1208         /* u1 = ftmp3 = x1*z2z2 */
1209         felem_assign(ftmp3, x1);
1210
1211         /* ftmp5 = 2*z1z2 */
1212         felem_scalar(ftmp5, z1, 2);
1213
1214         /* s1 = ftmp6 = y1 * z2**3 */
1215         felem_assign(ftmp6, y1);
1216     }
1217
1218     /* u2 = x2*z1z1 */
1219     felem_mul(tmp, x2, ftmp);
1220     /* tmp[i] < 17*2^120 */
1221
1222     /* h = ftmp4 = u2 - u1 */
1223     felem_diff_128_64(tmp, ftmp3);
1224     /* tmp[i] < 17*2^120 + 2^63 */
1225     felem_reduce(ftmp4, tmp);
1226
1227     x_equal = felem_is_zero(ftmp4);
1228
1229     /* z_out = ftmp5 * h */
1230     felem_mul(tmp, ftmp5, ftmp4);
1231     felem_reduce(z_out, tmp);
1232
1233     /* ftmp = z1 * z1z1 */
1234     felem_mul(tmp, ftmp, z1);
1235     felem_reduce(ftmp, tmp);
1236
1237     /* s2 = tmp = y2 * z1**3 */
1238     felem_mul(tmp, y2, ftmp);
1239     /* tmp[i] < 17*2^120 */
1240
1241     /* r = ftmp5 = (s2 - s1)*2 */
1242     felem_diff_128_64(tmp, ftmp6);
1243     /* tmp[i] < 17*2^120 + 2^63 */
1244     felem_reduce(ftmp5, tmp);
1245     y_equal = felem_is_zero(ftmp5);
1246     felem_scalar64(ftmp5, 2);
1247     /* ftmp5[i] < 2^61 */
1248
1249     /*
1250      * The formulae are incorrect if the points are equal, in affine coordinates
1251      * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1252      * happens.
1253      *
1254      * We use bitwise operations to avoid potential side-channels introduced by
1255      * the short-circuiting behaviour of boolean operators.
1256      *
1257      * The special case of either point being the point at infinity (z1 and/or
1258      * z2 are zero), is handled separately later on in this function, so we
1259      * avoid jumping to point_double here in those special cases.
1260      *
1261      * Notice the comment below on the implications of this branching for timing
1262      * leaks and why it is considered practically irrelevant.
1263      */
1264     points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1265
1266     if (points_equal) {
1267         /*
1268          * This is obviously not constant-time but it will almost-never happen
1269          * for ECDH / ECDSA. The case where it can happen is during scalar-mult
1270          * where the intermediate value gets very close to the group order.
1271          * Since |ec_GFp_nistp_recode_scalar_bits| produces signed digits for
1272          * the scalar, it's possible for the intermediate value to be a small
1273          * negative multiple of the base point, and for the final signed digit
1274          * to be the same value. We believe that this only occurs for the scalar
1275          * 1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
1276          * ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
1277          * 71e913863f7, in that case the penultimate intermediate is -9G and
1278          * the final digit is also -9G. Since this only happens for a single
1279          * scalar, the timing leak is irrelevant. (Any attacker who wanted to
1280          * check whether a secret scalar was that exact value, can already do
1281          * so.)
1282          */
1283         point_double(x3, y3, z3, x1, y1, z1);
1284         return;
1285     }
1286
1287     /* I = ftmp = (2h)**2 */
1288     felem_assign(ftmp, ftmp4);
1289     felem_scalar64(ftmp, 2);
1290     /* ftmp[i] < 2^61 */
1291     felem_square(tmp, ftmp);
1292     /* tmp[i] < 17*2^122 */
1293     felem_reduce(ftmp, tmp);
1294
1295     /* J = ftmp2 = h * I */
1296     felem_mul(tmp, ftmp4, ftmp);
1297     felem_reduce(ftmp2, tmp);
1298
1299     /* V = ftmp4 = U1 * I */
1300     felem_mul(tmp, ftmp3, ftmp);
1301     felem_reduce(ftmp4, tmp);
1302
1303     /* x_out = r**2 - J - 2V */
1304     felem_square(tmp, ftmp5);
1305     /* tmp[i] < 17*2^122 */
1306     felem_diff_128_64(tmp, ftmp2);
1307     /* tmp[i] < 17*2^122 + 2^63 */
1308     felem_assign(ftmp3, ftmp4);
1309     felem_scalar64(ftmp4, 2);
1310     /* ftmp4[i] < 2^61 */
1311     felem_diff_128_64(tmp, ftmp4);
1312     /* tmp[i] < 17*2^122 + 2^64 */
1313     felem_reduce(x_out, tmp);
1314
1315     /* y_out = r(V-x_out) - 2 * s1 * J */
1316     felem_diff64(ftmp3, x_out);
1317     /*
1318      * ftmp3[i] < 2^60 + 2^60 = 2^61
1319      */
1320     felem_mul(tmp, ftmp5, ftmp3);
1321     /* tmp[i] < 17*2^122 */
1322     felem_mul(tmp2, ftmp6, ftmp2);
1323     /* tmp2[i] < 17*2^120 */
1324     felem_scalar128(tmp2, 2);
1325     /* tmp2[i] < 17*2^121 */
1326     felem_diff128(tmp, tmp2);
1327         /*-
1328          * tmp[i] < 2^127 - 2^69 + 17*2^122
1329          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1330          *        < 2^127
1331          */
1332     felem_reduce(y_out, tmp);
1333
1334     copy_conditional(x_out, x2, z1_is_zero);
1335     copy_conditional(x_out, x1, z2_is_zero);
1336     copy_conditional(y_out, y2, z1_is_zero);
1337     copy_conditional(y_out, y1, z2_is_zero);
1338     copy_conditional(z_out, z2, z1_is_zero);
1339     copy_conditional(z_out, z1, z2_is_zero);
1340     felem_assign(x3, x_out);
1341     felem_assign(y3, y_out);
1342     felem_assign(z3, z_out);
1343 }
1344
1345 /*-
1346  * Base point pre computation
1347  * --------------------------
1348  *
1349  * Two different sorts of precomputed tables are used in the following code.
1350  * Each contain various points on the curve, where each point is three field
1351  * elements (x, y, z).
1352  *
1353  * For the base point table, z is usually 1 (0 for the point at infinity).
1354  * This table has 16 elements:
1355  * index | bits    | point
1356  * ------+---------+------------------------------
1357  *     0 | 0 0 0 0 | 0G
1358  *     1 | 0 0 0 1 | 1G
1359  *     2 | 0 0 1 0 | 2^130G
1360  *     3 | 0 0 1 1 | (2^130 + 1)G
1361  *     4 | 0 1 0 0 | 2^260G
1362  *     5 | 0 1 0 1 | (2^260 + 1)G
1363  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1364  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1365  *     8 | 1 0 0 0 | 2^390G
1366  *     9 | 1 0 0 1 | (2^390 + 1)G
1367  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1368  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1369  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1370  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1371  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1372  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1373  *
1374  * The reason for this is so that we can clock bits into four different
1375  * locations when doing simple scalar multiplies against the base point.
1376  *
1377  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1378
1379 /* gmul is the table of precomputed base points */
1380 static const felem gmul[16][3] = {
1381 {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1382  {0, 0, 0, 0, 0, 0, 0, 0, 0},
1383  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1384 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1385   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1386   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1387  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1388   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1389   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1390  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1391 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1392   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1393   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1394  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1395   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1396   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1397  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1398 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1399   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1400   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1401  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1402   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1403   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1404  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1405 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1406   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1407   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1408  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1409   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1410   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1411  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1412 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1413   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1414   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1415  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1416   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1417   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1418  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1419 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1420   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1421   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1422  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1423   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1424   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1425  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1426 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1427   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1428   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1429  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1430   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1431   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1432  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1433 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1434   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1435   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1436  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1437   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1438   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1439  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1440 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1441   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1442   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1443  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1444   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1445   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1446  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1447 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1448   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1449   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1450  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1451   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1452   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1453  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1454 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1455   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1456   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1457  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1458   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1459   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1460  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1461 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1462   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1463   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1464  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1465   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1466   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1467  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1468 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1469   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1470   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1471  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1472   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1473   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1474  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1475 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1476   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1477   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1478  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1479   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1480   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1481  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1482 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1483   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1484   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1485  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1486   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1487   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1488  {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1489 };
1490
1491 /*
1492  * select_point selects the |idx|th point from a precomputation table and
1493  * copies it to out.
1494  */
1495  /* pre_comp below is of the size provided in |size| */
1496 static void select_point(const limb idx, unsigned int size,
1497                          const felem pre_comp[][3], felem out[3])
1498 {
1499     unsigned i, j;
1500     limb *outlimbs = &out[0][0];
1501
1502     memset(out, 0, sizeof(*out) * 3);
1503
1504     for (i = 0; i < size; i++) {
1505         const limb *inlimbs = &pre_comp[i][0][0];
1506         limb mask = i ^ idx;
1507         mask |= mask >> 4;
1508         mask |= mask >> 2;
1509         mask |= mask >> 1;
1510         mask &= 1;
1511         mask--;
1512         for (j = 0; j < NLIMBS * 3; j++)
1513             outlimbs[j] |= inlimbs[j] & mask;
1514     }
1515 }
1516
1517 /* get_bit returns the |i|th bit in |in| */
1518 static char get_bit(const felem_bytearray in, int i)
1519 {
1520     if (i < 0)
1521         return 0;
1522     return (in[i >> 3] >> (i & 7)) & 1;
1523 }
1524
1525 /*
1526  * Interleaved point multiplication using precomputed point multiples: The
1527  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1528  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1529  * generator, using certain (large) precomputed multiples in g_pre_comp.
1530  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1531  */
1532 static void batch_mul(felem x_out, felem y_out, felem z_out,
1533                       const felem_bytearray scalars[],
1534                       const unsigned num_points, const u8 *g_scalar,
1535                       const int mixed, const felem pre_comp[][17][3],
1536                       const felem g_pre_comp[16][3])
1537 {
1538     int i, skip;
1539     unsigned num, gen_mul = (g_scalar != NULL);
1540     felem nq[3], tmp[4];
1541     limb bits;
1542     u8 sign, digit;
1543
1544     /* set nq to the point at infinity */
1545     memset(nq, 0, sizeof(nq));
1546
1547     /*
1548      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1549      * of the generator (last quarter of rounds) and additions of other
1550      * points multiples (every 5th round).
1551      */
1552     skip = 1;                   /* save two point operations in the first
1553                                  * round */
1554     for (i = (num_points ? 520 : 130); i >= 0; --i) {
1555         /* double */
1556         if (!skip)
1557             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1558
1559         /* add multiples of the generator */
1560         if (gen_mul && (i <= 130)) {
1561             bits = get_bit(g_scalar, i + 390) << 3;
1562             if (i < 130) {
1563                 bits |= get_bit(g_scalar, i + 260) << 2;
1564                 bits |= get_bit(g_scalar, i + 130) << 1;
1565                 bits |= get_bit(g_scalar, i);
1566             }
1567             /* select the point to add, in constant time */
1568             select_point(bits, 16, g_pre_comp, tmp);
1569             if (!skip) {
1570                 /* The 1 argument below is for "mixed" */
1571                 point_add(nq[0], nq[1], nq[2],
1572                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1573             } else {
1574                 memcpy(nq, tmp, 3 * sizeof(felem));
1575                 skip = 0;
1576             }
1577         }
1578
1579         /* do other additions every 5 doublings */
1580         if (num_points && (i % 5 == 0)) {
1581             /* loop over all scalars */
1582             for (num = 0; num < num_points; ++num) {
1583                 bits = get_bit(scalars[num], i + 4) << 5;
1584                 bits |= get_bit(scalars[num], i + 3) << 4;
1585                 bits |= get_bit(scalars[num], i + 2) << 3;
1586                 bits |= get_bit(scalars[num], i + 1) << 2;
1587                 bits |= get_bit(scalars[num], i) << 1;
1588                 bits |= get_bit(scalars[num], i - 1);
1589                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1590
1591                 /*
1592                  * select the point to add or subtract, in constant time
1593                  */
1594                 select_point(digit, 17, pre_comp[num], tmp);
1595                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1596                                             * point */
1597                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1598
1599                 if (!skip) {
1600                     point_add(nq[0], nq[1], nq[2],
1601                               nq[0], nq[1], nq[2],
1602                               mixed, tmp[0], tmp[1], tmp[2]);
1603                 } else {
1604                     memcpy(nq, tmp, 3 * sizeof(felem));
1605                     skip = 0;
1606                 }
1607             }
1608         }
1609     }
1610     felem_assign(x_out, nq[0]);
1611     felem_assign(y_out, nq[1]);
1612     felem_assign(z_out, nq[2]);
1613 }
1614
1615 /* Precomputation for the group generator. */
1616 struct nistp521_pre_comp_st {
1617     felem g_pre_comp[16][3];
1618     CRYPTO_REF_COUNT references;
1619     CRYPTO_RWLOCK *lock;
1620 };
1621
1622 const EC_METHOD *EC_GFp_nistp521_method(void)
1623 {
1624     static const EC_METHOD ret = {
1625         EC_FLAGS_DEFAULT_OCT,
1626         NID_X9_62_prime_field,
1627         ec_GFp_nistp521_group_init,
1628         ec_GFp_simple_group_finish,
1629         ec_GFp_simple_group_clear_finish,
1630         ec_GFp_nist_group_copy,
1631         ec_GFp_nistp521_group_set_curve,
1632         ec_GFp_simple_group_get_curve,
1633         ec_GFp_simple_group_get_degree,
1634         ec_group_simple_order_bits,
1635         ec_GFp_simple_group_check_discriminant,
1636         ec_GFp_simple_point_init,
1637         ec_GFp_simple_point_finish,
1638         ec_GFp_simple_point_clear_finish,
1639         ec_GFp_simple_point_copy,
1640         ec_GFp_simple_point_set_to_infinity,
1641         ec_GFp_simple_point_set_affine_coordinates,
1642         ec_GFp_nistp521_point_get_affine_coordinates,
1643         0 /* point_set_compressed_coordinates */ ,
1644         0 /* point2oct */ ,
1645         0 /* oct2point */ ,
1646         ec_GFp_simple_add,
1647         ec_GFp_simple_dbl,
1648         ec_GFp_simple_invert,
1649         ec_GFp_simple_is_at_infinity,
1650         ec_GFp_simple_is_on_curve,
1651         ec_GFp_simple_cmp,
1652         ec_GFp_simple_make_affine,
1653         ec_GFp_simple_points_make_affine,
1654         ec_GFp_nistp521_points_mul,
1655         ec_GFp_nistp521_precompute_mult,
1656         ec_GFp_nistp521_have_precompute_mult,
1657         ec_GFp_nist_field_mul,
1658         ec_GFp_nist_field_sqr,
1659         0 /* field_div */ ,
1660         ec_GFp_simple_field_inv,
1661         0 /* field_encode */ ,
1662         0 /* field_decode */ ,
1663         0,                      /* field_set_to_one */
1664         ec_key_simple_priv2oct,
1665         ec_key_simple_oct2priv,
1666         0, /* set private */
1667         ec_key_simple_generate_key,
1668         ec_key_simple_check_key,
1669         ec_key_simple_generate_public_key,
1670         0, /* keycopy */
1671         0, /* keyfinish */
1672         ecdh_simple_compute_key,
1673         ecdsa_simple_sign_setup,
1674         ecdsa_simple_sign_sig,
1675         ecdsa_simple_verify_sig,
1676         0, /* field_inverse_mod_ord */
1677         0, /* blind_coordinates */
1678         0, /* ladder_pre */
1679         0, /* ladder_step */
1680         0  /* ladder_post */
1681     };
1682
1683     return &ret;
1684 }
1685
1686 /******************************************************************************/
1687 /*
1688  * FUNCTIONS TO MANAGE PRECOMPUTATION
1689  */
1690
1691 static NISTP521_PRE_COMP *nistp521_pre_comp_new(void)
1692 {
1693     NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1694
1695     if (ret == NULL) {
1696         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1697         return ret;
1698     }
1699
1700     ret->references = 1;
1701
1702     ret->lock = CRYPTO_THREAD_lock_new();
1703     if (ret->lock == NULL) {
1704         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1705         OPENSSL_free(ret);
1706         return NULL;
1707     }
1708     return ret;
1709 }
1710
1711 NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1712 {
1713     int i;
1714     if (p != NULL)
1715         CRYPTO_UP_REF(&p->references, &i, p->lock);
1716     return p;
1717 }
1718
1719 void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1720 {
1721     int i;
1722
1723     if (p == NULL)
1724         return;
1725
1726     CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1727     REF_PRINT_COUNT("EC_nistp521", x);
1728     if (i > 0)
1729         return;
1730     REF_ASSERT_ISNT(i < 0);
1731
1732     CRYPTO_THREAD_lock_free(p->lock);
1733     OPENSSL_free(p);
1734 }
1735
1736 /******************************************************************************/
1737 /*
1738  * OPENSSL EC_METHOD FUNCTIONS
1739  */
1740
1741 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1742 {
1743     int ret;
1744     ret = ec_GFp_simple_group_init(group);
1745     group->a_is_minus3 = 1;
1746     return ret;
1747 }
1748
1749 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1750                                     const BIGNUM *a, const BIGNUM *b,
1751                                     BN_CTX *ctx)
1752 {
1753     int ret = 0;
1754     BIGNUM *curve_p, *curve_a, *curve_b;
1755 #ifndef FIPS_MODE
1756     BN_CTX *new_ctx = NULL;
1757
1758     if (ctx == NULL)
1759         ctx = new_ctx = BN_CTX_new();
1760 #endif
1761     if (ctx == NULL)
1762         return 0;
1763
1764     BN_CTX_start(ctx);
1765     curve_p = BN_CTX_get(ctx);
1766     curve_a = BN_CTX_get(ctx);
1767     curve_b = BN_CTX_get(ctx);
1768     if (curve_b == NULL)
1769         goto err;
1770     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1771     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1772     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1773     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1774         ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1775               EC_R_WRONG_CURVE_PARAMETERS);
1776         goto err;
1777     }
1778     group->field_mod_func = BN_nist_mod_521;
1779     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1780  err:
1781     BN_CTX_end(ctx);
1782 #ifndef FIPS_MODE
1783     BN_CTX_free(new_ctx);
1784 #endif
1785     return ret;
1786 }
1787
1788 /*
1789  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1790  * (X/Z^2, Y/Z^3)
1791  */
1792 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1793                                                  const EC_POINT *point,
1794                                                  BIGNUM *x, BIGNUM *y,
1795                                                  BN_CTX *ctx)
1796 {
1797     felem z1, z2, x_in, y_in, x_out, y_out;
1798     largefelem tmp;
1799
1800     if (EC_POINT_is_at_infinity(group, point)) {
1801         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1802               EC_R_POINT_AT_INFINITY);
1803         return 0;
1804     }
1805     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1806         (!BN_to_felem(z1, point->Z)))
1807         return 0;
1808     felem_inv(z2, z1);
1809     felem_square(tmp, z2);
1810     felem_reduce(z1, tmp);
1811     felem_mul(tmp, x_in, z1);
1812     felem_reduce(x_in, tmp);
1813     felem_contract(x_out, x_in);
1814     if (x != NULL) {
1815         if (!felem_to_BN(x, x_out)) {
1816             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1817                   ERR_R_BN_LIB);
1818             return 0;
1819         }
1820     }
1821     felem_mul(tmp, z1, z2);
1822     felem_reduce(z1, tmp);
1823     felem_mul(tmp, y_in, z1);
1824     felem_reduce(y_in, tmp);
1825     felem_contract(y_out, y_in);
1826     if (y != NULL) {
1827         if (!felem_to_BN(y, y_out)) {
1828             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1829                   ERR_R_BN_LIB);
1830             return 0;
1831         }
1832     }
1833     return 1;
1834 }
1835
1836 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1837 static void make_points_affine(size_t num, felem points[][3],
1838                                felem tmp_felems[])
1839 {
1840     /*
1841      * Runs in constant time, unless an input is the point at infinity (which
1842      * normally shouldn't happen).
1843      */
1844     ec_GFp_nistp_points_make_affine_internal(num,
1845                                              points,
1846                                              sizeof(felem),
1847                                              tmp_felems,
1848                                              (void (*)(void *))felem_one,
1849                                              felem_is_zero_int,
1850                                              (void (*)(void *, const void *))
1851                                              felem_assign,
1852                                              (void (*)(void *, const void *))
1853                                              felem_square_reduce, (void (*)
1854                                                                    (void *,
1855                                                                     const void
1856                                                                     *,
1857                                                                     const void
1858                                                                     *))
1859                                              felem_mul_reduce,
1860                                              (void (*)(void *, const void *))
1861                                              felem_inv,
1862                                              (void (*)(void *, const void *))
1863                                              felem_contract);
1864 }
1865
1866 /*
1867  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1868  * values Result is stored in r (r can equal one of the inputs).
1869  */
1870 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1871                                const BIGNUM *scalar, size_t num,
1872                                const EC_POINT *points[],
1873                                const BIGNUM *scalars[], BN_CTX *ctx)
1874 {
1875     int ret = 0;
1876     int j;
1877     int mixed = 0;
1878     BIGNUM *x, *y, *z, *tmp_scalar;
1879     felem_bytearray g_secret;
1880     felem_bytearray *secrets = NULL;
1881     felem (*pre_comp)[17][3] = NULL;
1882     felem *tmp_felems = NULL;
1883     unsigned i;
1884     int num_bytes;
1885     int have_pre_comp = 0;
1886     size_t num_points = num;
1887     felem x_in, y_in, z_in, x_out, y_out, z_out;
1888     NISTP521_PRE_COMP *pre = NULL;
1889     felem(*g_pre_comp)[3] = NULL;
1890     EC_POINT *generator = NULL;
1891     const EC_POINT *p = NULL;
1892     const BIGNUM *p_scalar = NULL;
1893
1894     BN_CTX_start(ctx);
1895     x = BN_CTX_get(ctx);
1896     y = BN_CTX_get(ctx);
1897     z = BN_CTX_get(ctx);
1898     tmp_scalar = BN_CTX_get(ctx);
1899     if (tmp_scalar == NULL)
1900         goto err;
1901
1902     if (scalar != NULL) {
1903         pre = group->pre_comp.nistp521;
1904         if (pre)
1905             /* we have precomputation, try to use it */
1906             g_pre_comp = &pre->g_pre_comp[0];
1907         else
1908             /* try to use the standard precomputation */
1909             g_pre_comp = (felem(*)[3]) gmul;
1910         generator = EC_POINT_new(group);
1911         if (generator == NULL)
1912             goto err;
1913         /* get the generator from precomputation */
1914         if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1915             !felem_to_BN(y, g_pre_comp[1][1]) ||
1916             !felem_to_BN(z, g_pre_comp[1][2])) {
1917             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1918             goto err;
1919         }
1920         if (!ec_GFp_simple_set_Jprojective_coordinates_GFp(group, generator, x,
1921                                                            y, z, ctx))
1922             goto err;
1923         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1924             /* precomputation matches generator */
1925             have_pre_comp = 1;
1926         else
1927             /*
1928              * we don't have valid precomputation: treat the generator as a
1929              * random point
1930              */
1931             num_points++;
1932     }
1933
1934     if (num_points > 0) {
1935         if (num_points >= 2) {
1936             /*
1937              * unless we precompute multiples for just one point, converting
1938              * those into affine form is time well spent
1939              */
1940             mixed = 1;
1941         }
1942         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1943         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1944         if (mixed)
1945             tmp_felems =
1946                 OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1947         if ((secrets == NULL) || (pre_comp == NULL)
1948             || (mixed && (tmp_felems == NULL))) {
1949             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1950             goto err;
1951         }
1952
1953         /*
1954          * we treat NULL scalars as 0, and NULL points as points at infinity,
1955          * i.e., they contribute nothing to the linear combination
1956          */
1957         for (i = 0; i < num_points; ++i) {
1958             if (i == num) {
1959                 /*
1960                  * we didn't have a valid precomputation, so we pick the
1961                  * generator
1962                  */
1963                 p = EC_GROUP_get0_generator(group);
1964                 p_scalar = scalar;
1965             } else {
1966                 /* the i^th point */
1967                 p = points[i];
1968                 p_scalar = scalars[i];
1969             }
1970             if ((p_scalar != NULL) && (p != NULL)) {
1971                 /* reduce scalar to 0 <= scalar < 2^521 */
1972                 if ((BN_num_bits(p_scalar) > 521)
1973                     || (BN_is_negative(p_scalar))) {
1974                     /*
1975                      * this is an unusual input, and we don't guarantee
1976                      * constant-timeness
1977                      */
1978                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1979                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1980                         goto err;
1981                     }
1982                     num_bytes = BN_bn2lebinpad(tmp_scalar,
1983                                                secrets[i], sizeof(secrets[i]));
1984                 } else {
1985                     num_bytes = BN_bn2lebinpad(p_scalar,
1986                                                secrets[i], sizeof(secrets[i]));
1987                 }
1988                 if (num_bytes < 0) {
1989                     ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1990                     goto err;
1991                 }
1992                 /* precompute multiples */
1993                 if ((!BN_to_felem(x_out, p->X)) ||
1994                     (!BN_to_felem(y_out, p->Y)) ||
1995                     (!BN_to_felem(z_out, p->Z)))
1996                     goto err;
1997                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1998                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1999                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
2000                 for (j = 2; j <= 16; ++j) {
2001                     if (j & 1) {
2002                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
2003                                   pre_comp[i][j][2], pre_comp[i][1][0],
2004                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
2005                                   pre_comp[i][j - 1][0],
2006                                   pre_comp[i][j - 1][1],
2007                                   pre_comp[i][j - 1][2]);
2008                     } else {
2009                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
2010                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
2011                                      pre_comp[i][j / 2][1],
2012                                      pre_comp[i][j / 2][2]);
2013                     }
2014                 }
2015             }
2016         }
2017         if (mixed)
2018             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
2019     }
2020
2021     /* the scalar for the generator */
2022     if ((scalar != NULL) && (have_pre_comp)) {
2023         memset(g_secret, 0, sizeof(g_secret));
2024         /* reduce scalar to 0 <= scalar < 2^521 */
2025         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
2026             /*
2027              * this is an unusual input, and we don't guarantee
2028              * constant-timeness
2029              */
2030             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2031                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2032                 goto err;
2033             }
2034             num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2035         } else {
2036             num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2037         }
2038         /* do the multiplication with generator precomputation */
2039         batch_mul(x_out, y_out, z_out,
2040                   (const felem_bytearray(*))secrets, num_points,
2041                   g_secret,
2042                   mixed, (const felem(*)[17][3])pre_comp,
2043                   (const felem(*)[3])g_pre_comp);
2044     } else {
2045         /* do the multiplication without generator precomputation */
2046         batch_mul(x_out, y_out, z_out,
2047                   (const felem_bytearray(*))secrets, num_points,
2048                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
2049     }
2050     /* reduce the output to its unique minimal representation */
2051     felem_contract(x_in, x_out);
2052     felem_contract(y_in, y_out);
2053     felem_contract(z_in, z_out);
2054     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2055         (!felem_to_BN(z, z_in))) {
2056         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2057         goto err;
2058     }
2059     ret = ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2060
2061  err:
2062     BN_CTX_end(ctx);
2063     EC_POINT_free(generator);
2064     OPENSSL_free(secrets);
2065     OPENSSL_free(pre_comp);
2066     OPENSSL_free(tmp_felems);
2067     return ret;
2068 }
2069
2070 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2071 {
2072     int ret = 0;
2073     NISTP521_PRE_COMP *pre = NULL;
2074     int i, j;
2075     BIGNUM *x, *y;
2076     EC_POINT *generator = NULL;
2077     felem tmp_felems[16];
2078 #ifndef FIPS_MODE
2079     BN_CTX *new_ctx = NULL;
2080 #endif
2081
2082     /* throw away old precomputation */
2083     EC_pre_comp_free(group);
2084
2085 #ifndef FIPS_MODE
2086     if (ctx == NULL)
2087         ctx = new_ctx = BN_CTX_new();
2088 #endif
2089     if (ctx == NULL)
2090         return 0;
2091
2092     BN_CTX_start(ctx);
2093     x = BN_CTX_get(ctx);
2094     y = BN_CTX_get(ctx);
2095     if (y == NULL)
2096         goto err;
2097     /* get the generator */
2098     if (group->generator == NULL)
2099         goto err;
2100     generator = EC_POINT_new(group);
2101     if (generator == NULL)
2102         goto err;
2103     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2104     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2105     if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2106         goto err;
2107     if ((pre = nistp521_pre_comp_new()) == NULL)
2108         goto err;
2109     /*
2110      * if the generator is the standard one, use built-in precomputation
2111      */
2112     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2113         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2114         goto done;
2115     }
2116     if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
2117         (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
2118         (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2119         goto err;
2120     /* compute 2^130*G, 2^260*G, 2^390*G */
2121     for (i = 1; i <= 4; i <<= 1) {
2122         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2123                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2124                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2125         for (j = 0; j < 129; ++j) {
2126             point_double(pre->g_pre_comp[2 * i][0],
2127                          pre->g_pre_comp[2 * i][1],
2128                          pre->g_pre_comp[2 * i][2],
2129                          pre->g_pre_comp[2 * i][0],
2130                          pre->g_pre_comp[2 * i][1],
2131                          pre->g_pre_comp[2 * i][2]);
2132         }
2133     }
2134     /* g_pre_comp[0] is the point at infinity */
2135     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2136     /* the remaining multiples */
2137     /* 2^130*G + 2^260*G */
2138     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2139               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2140               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2141               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2142               pre->g_pre_comp[2][2]);
2143     /* 2^130*G + 2^390*G */
2144     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2145               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2146               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2147               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2148               pre->g_pre_comp[2][2]);
2149     /* 2^260*G + 2^390*G */
2150     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2151               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2152               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2153               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2154               pre->g_pre_comp[4][2]);
2155     /* 2^130*G + 2^260*G + 2^390*G */
2156     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2157               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2158               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2159               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2160               pre->g_pre_comp[2][2]);
2161     for (i = 1; i < 8; ++i) {
2162         /* odd multiples: add G */
2163         point_add(pre->g_pre_comp[2 * i + 1][0],
2164                   pre->g_pre_comp[2 * i + 1][1],
2165                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2166                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2167                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2168                   pre->g_pre_comp[1][2]);
2169     }
2170     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2171
2172  done:
2173     SETPRECOMP(group, nistp521, pre);
2174     ret = 1;
2175     pre = NULL;
2176  err:
2177     BN_CTX_end(ctx);
2178     EC_POINT_free(generator);
2179 #ifndef FIPS_MODE
2180     BN_CTX_free(new_ctx);
2181 #endif
2182     EC_nistp521_pre_comp_free(pre);
2183     return ret;
2184 }
2185
2186 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2187 {
2188     return HAVEPRECOMP(group, nistp521);
2189 }