]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/ec/ecp_nistp521.c
For all assembler scripts where it matters, recognise clang > 9.x
[thirdparty/openssl.git] / crypto / ec / ecp_nistp521.c
1 /*
2 * Copyright 2011-2019 The OpenSSL Project Authors. All Rights Reserved.
3 *
4 * Licensed under the OpenSSL license (the "License"). You may not use
5 * this file except in compliance with the License. You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
8 */
9
10 /* Copyright 2011 Google Inc.
11 *
12 * Licensed under the Apache License, Version 2.0 (the "License");
13 *
14 * you may not use this file except in compliance with the License.
15 * You may obtain a copy of the License at
16 *
17 * http://www.apache.org/licenses/LICENSE-2.0
18 *
19 * Unless required by applicable law or agreed to in writing, software
20 * distributed under the License is distributed on an "AS IS" BASIS,
21 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22 * See the License for the specific language governing permissions and
23 * limitations under the License.
24 */
25
26 /*
27 * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
28 *
29 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
30 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
31 * work which got its smarts from Daniel J. Bernstein's work on the same.
32 */
33
34 #include <openssl/e_os2.h>
35 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
36 NON_EMPTY_TRANSLATION_UNIT
37 #else
38
39 # include <string.h>
40 # include <openssl/err.h>
41 # include "ec_local.h"
42
43 # if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
44 /* even with gcc, the typedef won't work for 32-bit platforms */
45 typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit
46 * platforms */
47 # else
48 # error "Your compiler doesn't appear to support 128-bit integer types"
49 # endif
50
51 typedef uint8_t u8;
52 typedef uint64_t u64;
53
54 /*
55 * The underlying field. P521 operates over GF(2^521-1). We can serialise an
56 * element of this field into 66 bytes where the most significant byte
57 * contains only a single bit. We call this an felem_bytearray.
58 */
59
60 typedef u8 felem_bytearray[66];
61
/*
 * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
 * These values are big-endian. The five entries are, in order: the field
 * prime p = 2^521 - 1, the curve coefficient a = -3 (mod p), the curve
 * coefficient b, and the affine x and y coordinates of the base point.
 */
static const felem_bytearray nistp521_curve_params[5] = {
    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff},
    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xfc},
    {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
     0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
     0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
     0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
     0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
     0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
     0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
     0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
     0x3f, 0x00},
    {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
     0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
     0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
     0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
     0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
     0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
     0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
     0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
     0xbd, 0x66},
    {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
     0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
     0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
     0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
     0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
     0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
     0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
     0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
     0x66, 0x50}
};
113
114 /*-
115 * The representation of field elements.
116 * ------------------------------------
117 *
118 * We represent field elements with nine values. These values are either 64 or
119 * 128 bits and the field element represented is:
120 * v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464 (mod p)
121 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
122 * 58 bits apart, but are greater than 58 bits in length, the most significant
123 * bits of each limb overlap with the least significant bits of the next.
124 *
125 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
126 * 'largefelem' */
127
128 # define NLIMBS 9
129
130 typedef uint64_t limb;
131 typedef limb felem[NLIMBS];
132 typedef uint128_t largefelem[NLIMBS];
133
134 static const limb bottom57bits = 0x1ffffffffffffff;
135 static const limb bottom58bits = 0x3ffffffffffffff;
136
137 /*
138 * bin66_to_felem takes a little-endian byte array and converts it into felem
139 * form. This assumes that the CPU is little-endian.
140 */
141 static void bin66_to_felem(felem out, const u8 in[66])
142 {
143 out[0] = (*((limb *) & in[0])) & bottom58bits;
144 out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
145 out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
146 out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
147 out[4] = (*((limb *) & in[29])) & bottom58bits;
148 out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
149 out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
150 out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
151 out[8] = (*((limb *) & in[58])) & bottom57bits;
152 }
153
154 /*
155 * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
156 * array. This assumes that the CPU is little-endian.
157 */
158 static void felem_to_bin66(u8 out[66], const felem in)
159 {
160 memset(out, 0, 66);
161 (*((limb *) & out[0])) = in[0];
162 (*((limb *) & out[7])) |= in[1] << 2;
163 (*((limb *) & out[14])) |= in[2] << 4;
164 (*((limb *) & out[21])) |= in[3] << 6;
165 (*((limb *) & out[29])) = in[4];
166 (*((limb *) & out[36])) |= in[5] << 2;
167 (*((limb *) & out[43])) |= in[6] << 4;
168 (*((limb *) & out[50])) |= in[7] << 6;
169 (*((limb *) & out[58])) = in[8];
170 }
171
172 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
173 static int BN_to_felem(felem out, const BIGNUM *bn)
174 {
175 felem_bytearray b_out;
176 int num_bytes;
177
178 if (BN_is_negative(bn)) {
179 ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
180 return 0;
181 }
182 num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
183 if (num_bytes < 0) {
184 ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
185 return 0;
186 }
187 bin66_to_felem(out, b_out);
188 return 1;
189 }
190
191 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
192 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
193 {
194 felem_bytearray b_out;
195 felem_to_bin66(b_out, in);
196 return BN_lebin2bn(b_out, sizeof(b_out), out);
197 }
198
199 /*-
200 * Field operations
201 * ----------------
202 */
203
204 static void felem_one(felem out)
205 {
206 out[0] = 1;
207 out[1] = 0;
208 out[2] = 0;
209 out[3] = 0;
210 out[4] = 0;
211 out[5] = 0;
212 out[6] = 0;
213 out[7] = 0;
214 out[8] = 0;
215 }
216
217 static void felem_assign(felem out, const felem in)
218 {
219 out[0] = in[0];
220 out[1] = in[1];
221 out[2] = in[2];
222 out[3] = in[3];
223 out[4] = in[4];
224 out[5] = in[5];
225 out[6] = in[6];
226 out[7] = in[7];
227 out[8] = in[8];
228 }
229
230 /* felem_sum64 sets out = out + in. */
231 static void felem_sum64(felem out, const felem in)
232 {
233 out[0] += in[0];
234 out[1] += in[1];
235 out[2] += in[2];
236 out[3] += in[3];
237 out[4] += in[4];
238 out[5] += in[5];
239 out[6] += in[6];
240 out[7] += in[7];
241 out[8] += in[8];
242 }
243
244 /* felem_scalar sets out = in * scalar */
245 static void felem_scalar(felem out, const felem in, limb scalar)
246 {
247 out[0] = in[0] * scalar;
248 out[1] = in[1] * scalar;
249 out[2] = in[2] * scalar;
250 out[3] = in[3] * scalar;
251 out[4] = in[4] * scalar;
252 out[5] = in[5] * scalar;
253 out[6] = in[6] * scalar;
254 out[7] = in[7] * scalar;
255 out[8] = in[8] * scalar;
256 }
257
258 /* felem_scalar64 sets out = out * scalar */
259 static void felem_scalar64(felem out, limb scalar)
260 {
261 out[0] *= scalar;
262 out[1] *= scalar;
263 out[2] *= scalar;
264 out[3] *= scalar;
265 out[4] *= scalar;
266 out[5] *= scalar;
267 out[6] *= scalar;
268 out[7] *= scalar;
269 out[8] *= scalar;
270 }
271
272 /* felem_scalar128 sets out = out * scalar */
273 static void felem_scalar128(largefelem out, limb scalar)
274 {
275 out[0] *= scalar;
276 out[1] *= scalar;
277 out[2] *= scalar;
278 out[3] *= scalar;
279 out[4] *= scalar;
280 out[5] *= scalar;
281 out[6] *= scalar;
282 out[7] *= scalar;
283 out[8] *= scalar;
284 }
285
286 /*-
287 * felem_neg sets |out| to |-in|
288 * On entry:
289 * in[i] < 2^59 + 2^14
290 * On exit:
291 * out[i] < 2^62
292 */
293 static void felem_neg(felem out, const felem in)
294 {
295 /* In order to prevent underflow, we subtract from 0 mod p. */
296 static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
297 static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
298
299 out[0] = two62m3 - in[0];
300 out[1] = two62m2 - in[1];
301 out[2] = two62m2 - in[2];
302 out[3] = two62m2 - in[3];
303 out[4] = two62m2 - in[4];
304 out[5] = two62m2 - in[5];
305 out[6] = two62m2 - in[6];
306 out[7] = two62m2 - in[7];
307 out[8] = two62m2 - in[8];
308 }
309
310 /*-
311 * felem_diff64 subtracts |in| from |out|
312 * On entry:
313 * in[i] < 2^59 + 2^14
314 * On exit:
315 * out[i] < out[i] + 2^62
316 */
317 static void felem_diff64(felem out, const felem in)
318 {
319 /*
320 * In order to prevent underflow, we add 0 mod p before subtracting.
321 */
322 static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
323 static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
324
325 out[0] += two62m3 - in[0];
326 out[1] += two62m2 - in[1];
327 out[2] += two62m2 - in[2];
328 out[3] += two62m2 - in[3];
329 out[4] += two62m2 - in[4];
330 out[5] += two62m2 - in[5];
331 out[6] += two62m2 - in[6];
332 out[7] += two62m2 - in[7];
333 out[8] += two62m2 - in[8];
334 }
335
336 /*-
337 * felem_diff_128_64 subtracts |in| from |out|
338 * On entry:
339 * in[i] < 2^62 + 2^17
340 * On exit:
341 * out[i] < out[i] + 2^63
342 */
343 static void felem_diff_128_64(largefelem out, const felem in)
344 {
345 /*
346 * In order to prevent underflow, we add 64p mod p (which is equivalent
347 * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
348 * digit number with all bits set to 1. See "The representation of field
349 * elements" comment above for a description of how limbs are used to
350 * represent a number. 64p is represented with 8 limbs containing a number
351 * with 58 bits set and one limb with a number with 57 bits set.
352 */
353 static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
354 static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
355
356 out[0] += two63m6 - in[0];
357 out[1] += two63m5 - in[1];
358 out[2] += two63m5 - in[2];
359 out[3] += two63m5 - in[3];
360 out[4] += two63m5 - in[4];
361 out[5] += two63m5 - in[5];
362 out[6] += two63m5 - in[6];
363 out[7] += two63m5 - in[7];
364 out[8] += two63m5 - in[8];
365 }
366
367 /*-
368 * felem_diff_128_64 subtracts |in| from |out|
369 * On entry:
370 * in[i] < 2^126
371 * On exit:
372 * out[i] < out[i] + 2^127 - 2^69
373 */
374 static void felem_diff128(largefelem out, const largefelem in)
375 {
376 /*
377 * In order to prevent underflow, we add 0 mod p before subtracting.
378 */
379 static const uint128_t two127m70 =
380 (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
381 static const uint128_t two127m69 =
382 (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
383
384 out[0] += (two127m70 - in[0]);
385 out[1] += (two127m69 - in[1]);
386 out[2] += (two127m69 - in[2]);
387 out[3] += (two127m69 - in[3]);
388 out[4] += (two127m69 - in[4]);
389 out[5] += (two127m69 - in[5]);
390 out[6] += (two127m69 - in[6]);
391 out[7] += (two127m69 - in[7]);
392 out[8] += (two127m69 - in[8]);
393 }
394
/*-
 * felem_square sets |out| = |in|^2
 * On entry:
 *   in[i] < 2^62
 * On exit:
 *   out[i] < 17 * max(in[i]) * max(in[i])
 *
 * out[k] accumulates the coefficient of 2^(58*k) of the product; terms that
 * would land at 2^(58*k) for k >= 9 are folded back using 2^521 = 1 (mod p).
 */
static void felem_square(largefelem out, const felem in)
{
    felem inx2, inx4;
    felem_scalar(inx2, in, 2);
    felem_scalar(inx4, in, 4);

    /*-
     * We have many cases were we want to do
     *   in[x] * in[y] +
     *   in[y] * in[x]
     * This is obviously just
     *   2 * in[x] * in[y]
     * However, rather than do the doubling on the 128 bit result, we
     * double one of the inputs to the multiplication by reading from
     * |inx2|
     */

    out[0] = ((uint128_t) in[0]) * in[0];
    out[1] = ((uint128_t) in[0]) * inx2[1];
    out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
    out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
    out[4] = ((uint128_t) in[0]) * inx2[4] +
        ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
    out[5] = ((uint128_t) in[0]) * inx2[5] +
        ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
    out[6] = ((uint128_t) in[0]) * inx2[6] +
        ((uint128_t) in[1]) * inx2[5] +
        ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
    out[7] = ((uint128_t) in[0]) * inx2[7] +
        ((uint128_t) in[1]) * inx2[6] +
        ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
    out[8] = ((uint128_t) in[0]) * inx2[8] +
        ((uint128_t) in[1]) * inx2[7] +
        ((uint128_t) in[2]) * inx2[6] +
        ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];

    /*
     * The remaining limbs fall above 2^521, with the first falling at 2^522.
     * They correspond to locations one bit up from the limbs produced above
     * so we would have to multiply by two to align them. Again, rather than
     * operate on the 128-bit result, we double one of the inputs to the
     * multiplication. If we want to double for both this reason, and the
     * reason above, then we end up multiplying by four.
     */

    /* 9 */
    out[0] += ((uint128_t) in[1]) * inx4[8] +
        ((uint128_t) in[2]) * inx4[7] +
        ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];

    /* 10 */
    out[1] += ((uint128_t) in[2]) * inx4[8] +
        ((uint128_t) in[3]) * inx4[7] +
        ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];

    /* 11 */
    out[2] += ((uint128_t) in[3]) * inx4[8] +
        ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];

    /* 12 */
    out[3] += ((uint128_t) in[4]) * inx4[8] +
        ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];

    /* 13 */
    out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];

    /* 14 */
    out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];

    /* 15 */
    out[6] += ((uint128_t) in[7]) * inx4[8];

    /* 16 */
    out[7] += ((uint128_t) in[8]) * inx2[8];
}
477
/*-
 * felem_mul sets |out| = |in1| * |in2|
 * On entry:
 *   in1[i] < 2^64
 *   in2[i] < 2^63
 * On exit:
 *   out[i] < 17 * max(in1[i]) * max(in2[i])
 *
 * As in felem_square, out[k] accumulates the coefficient of 2^(58*k):
 * first the schoolbook terms that land at k <= 8, then the terms that
 * would land at k >= 9, folded back via 2^521 = 1 (mod p).
 */
static void felem_mul(largefelem out, const felem in1, const felem in2)
{
    felem in2x2;
    felem_scalar(in2x2, in2, 2);

    out[0] = ((uint128_t) in1[0]) * in2[0];

    out[1] = ((uint128_t) in1[0]) * in2[1] +
        ((uint128_t) in1[1]) * in2[0];

    out[2] = ((uint128_t) in1[0]) * in2[2] +
        ((uint128_t) in1[1]) * in2[1] +
        ((uint128_t) in1[2]) * in2[0];

    out[3] = ((uint128_t) in1[0]) * in2[3] +
        ((uint128_t) in1[1]) * in2[2] +
        ((uint128_t) in1[2]) * in2[1] +
        ((uint128_t) in1[3]) * in2[0];

    out[4] = ((uint128_t) in1[0]) * in2[4] +
        ((uint128_t) in1[1]) * in2[3] +
        ((uint128_t) in1[2]) * in2[2] +
        ((uint128_t) in1[3]) * in2[1] +
        ((uint128_t) in1[4]) * in2[0];

    out[5] = ((uint128_t) in1[0]) * in2[5] +
        ((uint128_t) in1[1]) * in2[4] +
        ((uint128_t) in1[2]) * in2[3] +
        ((uint128_t) in1[3]) * in2[2] +
        ((uint128_t) in1[4]) * in2[1] +
        ((uint128_t) in1[5]) * in2[0];

    out[6] = ((uint128_t) in1[0]) * in2[6] +
        ((uint128_t) in1[1]) * in2[5] +
        ((uint128_t) in1[2]) * in2[4] +
        ((uint128_t) in1[3]) * in2[3] +
        ((uint128_t) in1[4]) * in2[2] +
        ((uint128_t) in1[5]) * in2[1] +
        ((uint128_t) in1[6]) * in2[0];

    out[7] = ((uint128_t) in1[0]) * in2[7] +
        ((uint128_t) in1[1]) * in2[6] +
        ((uint128_t) in1[2]) * in2[5] +
        ((uint128_t) in1[3]) * in2[4] +
        ((uint128_t) in1[4]) * in2[3] +
        ((uint128_t) in1[5]) * in2[2] +
        ((uint128_t) in1[6]) * in2[1] +
        ((uint128_t) in1[7]) * in2[0];

    out[8] = ((uint128_t) in1[0]) * in2[8] +
        ((uint128_t) in1[1]) * in2[7] +
        ((uint128_t) in1[2]) * in2[6] +
        ((uint128_t) in1[3]) * in2[5] +
        ((uint128_t) in1[4]) * in2[4] +
        ((uint128_t) in1[5]) * in2[3] +
        ((uint128_t) in1[6]) * in2[2] +
        ((uint128_t) in1[7]) * in2[1] +
        ((uint128_t) in1[8]) * in2[0];

    /* See comment in felem_square about the use of in2x2 here */

    out[0] += ((uint128_t) in1[1]) * in2x2[8] +
        ((uint128_t) in1[2]) * in2x2[7] +
        ((uint128_t) in1[3]) * in2x2[6] +
        ((uint128_t) in1[4]) * in2x2[5] +
        ((uint128_t) in1[5]) * in2x2[4] +
        ((uint128_t) in1[6]) * in2x2[3] +
        ((uint128_t) in1[7]) * in2x2[2] +
        ((uint128_t) in1[8]) * in2x2[1];

    out[1] += ((uint128_t) in1[2]) * in2x2[8] +
        ((uint128_t) in1[3]) * in2x2[7] +
        ((uint128_t) in1[4]) * in2x2[6] +
        ((uint128_t) in1[5]) * in2x2[5] +
        ((uint128_t) in1[6]) * in2x2[4] +
        ((uint128_t) in1[7]) * in2x2[3] +
        ((uint128_t) in1[8]) * in2x2[2];

    out[2] += ((uint128_t) in1[3]) * in2x2[8] +
        ((uint128_t) in1[4]) * in2x2[7] +
        ((uint128_t) in1[5]) * in2x2[6] +
        ((uint128_t) in1[6]) * in2x2[5] +
        ((uint128_t) in1[7]) * in2x2[4] +
        ((uint128_t) in1[8]) * in2x2[3];

    out[3] += ((uint128_t) in1[4]) * in2x2[8] +
        ((uint128_t) in1[5]) * in2x2[7] +
        ((uint128_t) in1[6]) * in2x2[6] +
        ((uint128_t) in1[7]) * in2x2[5] +
        ((uint128_t) in1[8]) * in2x2[4];

    out[4] += ((uint128_t) in1[5]) * in2x2[8] +
        ((uint128_t) in1[6]) * in2x2[7] +
        ((uint128_t) in1[7]) * in2x2[6] +
        ((uint128_t) in1[8]) * in2x2[5];

    out[5] += ((uint128_t) in1[6]) * in2x2[8] +
        ((uint128_t) in1[7]) * in2x2[7] +
        ((uint128_t) in1[8]) * in2x2[6];

    out[6] += ((uint128_t) in1[7]) * in2x2[8] +
        ((uint128_t) in1[8]) * in2x2[7];

    out[7] += ((uint128_t) in1[8]) * in2x2[8];
}
591
592 static const limb bottom52bits = 0xfffffffffffff;
593
/*-
 * felem_reduce converts a largefelem to an felem.
 * On entry:
 *   in[i] < 2^128
 * On exit:
 *   out[i] < 2^59 + 2^14
 *
 * Each 128-bit limb in[i] contributes three pieces: its bottom 58 bits stay
 * at position i; bits 58..63 plus the bottom 52 bits of its high word carry
 * into position i+1; the top 12 bits of its high word carry into position
 * i+2. Carries out of position 8 wrap to positions 0 and 1 using
 * 2^521 = 1 (mod p); since position 9 sits one bit above 2^521, the wrapped
 * values are doubled (the <<= 1 below).
 */
static void felem_reduce(felem out, const largefelem in)
{
    u64 overflow1, overflow2;

    out[0] = ((limb) in[0]) & bottom58bits;
    out[1] = ((limb) in[1]) & bottom58bits;
    out[2] = ((limb) in[2]) & bottom58bits;
    out[3] = ((limb) in[3]) & bottom58bits;
    out[4] = ((limb) in[4]) & bottom58bits;
    out[5] = ((limb) in[5]) & bottom58bits;
    out[6] = ((limb) in[6]) & bottom58bits;
    out[7] = ((limb) in[7]) & bottom58bits;
    out[8] = ((limb) in[8]) & bottom58bits;

    /* out[i] < 2^58 */

    out[1] += ((limb) in[0]) >> 58;
    out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
    /*-
     * out[1] < 2^58 + 2^6 + 2^58
     *        = 2^59 + 2^6
     */
    out[2] += ((limb) (in[0] >> 64)) >> 52;

    out[2] += ((limb) in[1]) >> 58;
    out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
    out[3] += ((limb) (in[1] >> 64)) >> 52;

    out[3] += ((limb) in[2]) >> 58;
    out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
    out[4] += ((limb) (in[2] >> 64)) >> 52;

    out[4] += ((limb) in[3]) >> 58;
    out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
    out[5] += ((limb) (in[3] >> 64)) >> 52;

    out[5] += ((limb) in[4]) >> 58;
    out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
    out[6] += ((limb) (in[4] >> 64)) >> 52;

    out[6] += ((limb) in[5]) >> 58;
    out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
    out[7] += ((limb) (in[5] >> 64)) >> 52;

    out[7] += ((limb) in[6]) >> 58;
    out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
    out[8] += ((limb) (in[6] >> 64)) >> 52;

    out[8] += ((limb) in[7]) >> 58;
    out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
    /*-
     * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
     *            < 2^59 + 2^13
     */
    overflow1 = ((limb) (in[7] >> 64)) >> 52;

    overflow1 += ((limb) in[8]) >> 58;
    overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
    overflow2 = ((limb) (in[8] >> 64)) >> 52;

    /* Wrapped carries land one bit above 2^521, hence the doubling. */
    overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
    overflow2 <<= 1;            /* overflow2 < 2^13 */

    out[0] += overflow1;        /* out[0] < 2^60 */
    out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */

    out[1] += out[0] >> 58;
    out[0] &= bottom58bits;
    /*-
     * out[0] < 2^58
     * out[1] < 2^59 + 2^6 + 2^13 + 2^2
     *        < 2^59 + 2^14
     */
}
675
676 static void felem_square_reduce(felem out, const felem in)
677 {
678 largefelem tmp;
679 felem_square(tmp, in);
680 felem_reduce(out, tmp);
681 }
682
683 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
684 {
685 largefelem tmp;
686 felem_mul(tmp, in1, in2);
687 felem_reduce(out, tmp);
688 }
689
/*-
 * felem_inv calculates |out| = |in|^{-1}
 *
 * Based on Fermat's Little Theorem:
 *   a^p = a (mod p)
 *   a^{p-1} = 1 (mod p)
 *   a^{p-2} = a^{-1} (mod p)
 *
 * The exponent p-2 = 2^521 - 3 is built with an addition chain; the
 * comments track the exponent held in each temporary, e.g. "2^4 - 2^0"
 * means the value is in^(2^4 - 2^0).
 */
static void felem_inv(felem out, const felem in)
{
    felem ftmp, ftmp2, ftmp3, ftmp4;
    largefelem tmp;
    unsigned i;

    felem_square(tmp, in);
    felem_reduce(ftmp, tmp);    /* 2^1 */
    felem_mul(tmp, in, ftmp);
    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
    felem_assign(ftmp2, ftmp);
    felem_square(tmp, ftmp);
    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
    felem_mul(tmp, in, ftmp);
    felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
    felem_square(tmp, ftmp);
    felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */

    felem_square(tmp, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
    felem_square(tmp, ftmp3);
    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */

    felem_assign(ftmp2, ftmp3);
    felem_square(tmp, ftmp3);
    felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
    felem_square(tmp, ftmp3);
    felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
    felem_square(tmp, ftmp3);
    felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
    felem_square(tmp, ftmp3);
    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
    /*
     * NOTE(review): this assignment is immediately overwritten by the
     * felem_reduce into ftmp4 two lines below; it appears to be a dead
     * store retained from the original code.
     */
    felem_assign(ftmp4, ftmp3);
    felem_mul(tmp, ftmp3, ftmp);
    felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
    felem_square(tmp, ftmp4);
    felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 8; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
    }
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 16; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
    }
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 32; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
    }
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 64; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
    }
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 128; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
    }
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 256; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
    }
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */

    for (i = 0; i < 9; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
    }
    felem_mul(tmp, ftmp3, ftmp4);
    felem_reduce(ftmp3, tmp);   /* 2^521 - 2^2 (ftmp4 holds 2^9 - 2^2) */
    felem_mul(tmp, ftmp3, in);
    felem_reduce(out, tmp);     /* 2^521 - 3 = p - 2 */
}
797
/*
 * This is 2^521-1, expressed as an felem: eight limbs of 58 set bits
 * followed by one limb of 57 set bits (8*58 + 57 = 521).
 */
static const felem kPrime = {
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
};
804
/*-
 * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 * otherwise. Runs in constant time (no data-dependent branches).
 * On entry:
 *   in[i] < 2^59 + 2^14
 */
static limb felem_is_zero(const felem in)
{
    felem ftmp;
    limb is_zero, is_p;
    felem_assign(ftmp, in);

    /* Propagate carries so that every limb fits its nominal width. */
    ftmp[0] += ftmp[8] >> 57;
    ftmp[8] &= bottom57bits;
    /* ftmp[8] < 2^57 */
    ftmp[1] += ftmp[0] >> 58;
    ftmp[0] &= bottom58bits;
    ftmp[2] += ftmp[1] >> 58;
    ftmp[1] &= bottom58bits;
    ftmp[3] += ftmp[2] >> 58;
    ftmp[2] &= bottom58bits;
    ftmp[4] += ftmp[3] >> 58;
    ftmp[3] &= bottom58bits;
    ftmp[5] += ftmp[4] >> 58;
    ftmp[4] &= bottom58bits;
    ftmp[6] += ftmp[5] >> 58;
    ftmp[5] &= bottom58bits;
    ftmp[7] += ftmp[6] >> 58;
    ftmp[6] &= bottom58bits;
    ftmp[8] += ftmp[7] >> 58;
    ftmp[7] &= bottom58bits;
    /* ftmp[8] < 2^57 + 4 */

    /*
     * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
     * than our bound for ftmp[8]. Therefore we only have to check whether
     * the value is zero or 2^521-1.
     */

    is_zero = 0;
    is_zero |= ftmp[0];
    is_zero |= ftmp[1];
    is_zero |= ftmp[2];
    is_zero |= ftmp[3];
    is_zero |= ftmp[4];
    is_zero |= ftmp[5];
    is_zero |= ftmp[6];
    is_zero |= ftmp[7];
    is_zero |= ftmp[8];

    is_zero--;
    /*
     * We know that ftmp[i] < 2^63, therefore the only way that the top bit
     * can be set is if is_zero was 0 before the decrement.
     */
    is_zero = 0 - (is_zero >> 63);

    /* Same trick for equality with p: XOR is zero iff all limbs match. */
    is_p = ftmp[0] ^ kPrime[0];
    is_p |= ftmp[1] ^ kPrime[1];
    is_p |= ftmp[2] ^ kPrime[2];
    is_p |= ftmp[3] ^ kPrime[3];
    is_p |= ftmp[4] ^ kPrime[4];
    is_p |= ftmp[5] ^ kPrime[5];
    is_p |= ftmp[6] ^ kPrime[6];
    is_p |= ftmp[7] ^ kPrime[7];
    is_p |= ftmp[8] ^ kPrime[8];

    is_p--;
    is_p = 0 - (is_p >> 63);

    is_zero |= is_p;
    return is_zero;
}
878
879 static int felem_is_zero_int(const void *in)
880 {
881 return (int)(felem_is_zero(in) & ((limb) 1));
882 }
883
884 /*-
885 * felem_contract converts |in| to its unique, minimal representation.
886 * On entry:
887 * in[i] < 2^59 + 2^14
888 */
889 static void felem_contract(felem out, const felem in)
890 {
891 limb is_p, is_greater, sign;
892 static const limb two58 = ((limb) 1) << 58;
893
894 felem_assign(out, in);
895
896 out[0] += out[8] >> 57;
897 out[8] &= bottom57bits;
898 /* out[8] < 2^57 */
899 out[1] += out[0] >> 58;
900 out[0] &= bottom58bits;
901 out[2] += out[1] >> 58;
902 out[1] &= bottom58bits;
903 out[3] += out[2] >> 58;
904 out[2] &= bottom58bits;
905 out[4] += out[3] >> 58;
906 out[3] &= bottom58bits;
907 out[5] += out[4] >> 58;
908 out[4] &= bottom58bits;
909 out[6] += out[5] >> 58;
910 out[5] &= bottom58bits;
911 out[7] += out[6] >> 58;
912 out[6] &= bottom58bits;
913 out[8] += out[7] >> 58;
914 out[7] &= bottom58bits;
915 /* out[8] < 2^57 + 4 */
916
917 /*
918 * If the value is greater than 2^521-1 then we have to subtract 2^521-1
919 * out. See the comments in felem_is_zero regarding why we don't test for
920 * other multiples of the prime.
921 */
922
923 /*
924 * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
925 */
926
927 is_p = out[0] ^ kPrime[0];
928 is_p |= out[1] ^ kPrime[1];
929 is_p |= out[2] ^ kPrime[2];
930 is_p |= out[3] ^ kPrime[3];
931 is_p |= out[4] ^ kPrime[4];
932 is_p |= out[5] ^ kPrime[5];
933 is_p |= out[6] ^ kPrime[6];
934 is_p |= out[7] ^ kPrime[7];
935 is_p |= out[8] ^ kPrime[8];
936
937 is_p--;
938 is_p &= is_p << 32;
939 is_p &= is_p << 16;
940 is_p &= is_p << 8;
941 is_p &= is_p << 4;
942 is_p &= is_p << 2;
943 is_p &= is_p << 1;
944 is_p = 0 - (is_p >> 63);
945 is_p = ~is_p;
946
947 /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
948
949 out[0] &= is_p;
950 out[1] &= is_p;
951 out[2] &= is_p;
952 out[3] &= is_p;
953 out[4] &= is_p;
954 out[5] &= is_p;
955 out[6] &= is_p;
956 out[7] &= is_p;
957 out[8] &= is_p;
958
959 /*
960 * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
961 * 57 is greater than zero as (2^521-1) + x >= 2^522
962 */
963 is_greater = out[8] >> 57;
964 is_greater |= is_greater << 32;
965 is_greater |= is_greater << 16;
966 is_greater |= is_greater << 8;
967 is_greater |= is_greater << 4;
968 is_greater |= is_greater << 2;
969 is_greater |= is_greater << 1;
970 is_greater = 0 - (is_greater >> 63);
971
972 out[0] -= kPrime[0] & is_greater;
973 out[1] -= kPrime[1] & is_greater;
974 out[2] -= kPrime[2] & is_greater;
975 out[3] -= kPrime[3] & is_greater;
976 out[4] -= kPrime[4] & is_greater;
977 out[5] -= kPrime[5] & is_greater;
978 out[6] -= kPrime[6] & is_greater;
979 out[7] -= kPrime[7] & is_greater;
980 out[8] -= kPrime[8] & is_greater;
981
982 /* Eliminate negative coefficients */
983 sign = -(out[0] >> 63);
984 out[0] += (two58 & sign);
985 out[1] -= (1 & sign);
986 sign = -(out[1] >> 63);
987 out[1] += (two58 & sign);
988 out[2] -= (1 & sign);
989 sign = -(out[2] >> 63);
990 out[2] += (two58 & sign);
991 out[3] -= (1 & sign);
992 sign = -(out[3] >> 63);
993 out[3] += (two58 & sign);
994 out[4] -= (1 & sign);
995 sign = -(out[4] >> 63);
996 out[4] += (two58 & sign);
997 out[5] -= (1 & sign);
998 sign = -(out[0] >> 63);
999 out[5] += (two58 & sign);
1000 out[6] -= (1 & sign);
1001 sign = -(out[6] >> 63);
1002 out[6] += (two58 & sign);
1003 out[7] -= (1 & sign);
1004 sign = -(out[7] >> 63);
1005 out[7] += (two58 & sign);
1006 out[8] -= (1 & sign);
1007 sign = -(out[5] >> 63);
1008 out[5] += (two58 & sign);
1009 out[6] -= (1 & sign);
1010 sign = -(out[6] >> 63);
1011 out[6] += (two58 & sign);
1012 out[7] -= (1 & sign);
1013 sign = -(out[7] >> 63);
1014 out[7] += (two58 & sign);
1015 out[8] -= (1 & sign);
1016 }
1017
1018 /*-
1019 * Group operations
1020 * ----------------
1021 *
1022 * Building on top of the field operations we have the operations on the
1023 * elliptic curve group itself. Points on the curve are represented in Jacobian
1024 * coordinates */
1025
1026 /*-
1027 * point_double calculates 2*(x_in, y_in, z_in)
1028 *
1029 * The method is taken from:
1030 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1031 *
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 * while x_out == y_in is not (maybe this works, but it's not tested). */
static void
point_double(felem x_out, felem y_out, felem z_out,
             const felem x_in, const felem y_in, const felem z_in)
{
    /* largefelem accumulators for unreduced products of two felems */
    largefelem tmp, tmp2;
    felem delta, gamma, beta, alpha, ftmp, ftmp2;

    /* scratch copies of x: ftmp becomes x - delta, ftmp2 becomes x + delta */
    felem_assign(ftmp, x_in);
    felem_assign(ftmp2, x_in);

    /* delta = z^2 */
    felem_square(tmp, z_in);
    felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */

    /* gamma = y^2 */
    felem_square(tmp, y_in);
    felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */

    /* beta = x*gamma */
    felem_mul(tmp, x_in, gamma);
    felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */

    /* alpha = 3*(x-delta)*(x+delta) */
    felem_diff64(ftmp, delta);
    /* ftmp[i] < 2^61 */
    felem_sum64(ftmp2, delta);
    /* ftmp2[i] < 2^60 + 2^15 */
    felem_scalar64(ftmp2, 3);
    /* ftmp2[i] < 3*2^60 + 3*2^15 */
    felem_mul(tmp, ftmp, ftmp2);
    /*-
     * tmp[i] < 17(3*2^121 + 3*2^76)
     *        = 61*2^121 + 61*2^76
     *        < 64*2^121 + 64*2^76
     *        = 2^127 + 2^82
     *        < 2^128
     */
    felem_reduce(alpha, tmp);

    /* x' = alpha^2 - 8*beta */
    felem_square(tmp, alpha);
    /*
     * tmp[i] < 17*2^120 < 2^125
     */
    felem_assign(ftmp, beta);
    felem_scalar64(ftmp, 8);
    /* ftmp[i] < 2^62 + 2^17 */
    felem_diff_128_64(tmp, ftmp);
    /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
    felem_reduce(x_out, tmp);

    /* z' = (y + z)^2 - gamma - delta */
    felem_sum64(delta, gamma);
    /* delta[i] < 2^60 + 2^15 */
    felem_assign(ftmp, y_in);
    felem_sum64(ftmp, z_in);
    /* ftmp[i] < 2^60 + 2^15 */
    felem_square(tmp, ftmp);
    /*
     * tmp[i] < 17(2^122) < 2^127
     */
    felem_diff_128_64(tmp, delta);
    /* tmp[i] < 2^127 + 2^63 */
    felem_reduce(z_out, tmp);

    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
    felem_scalar64(beta, 4);
    /* beta[i] < 2^61 + 2^16 */
    felem_diff64(beta, x_out);
    /* beta[i] < 2^61 + 2^60 + 2^16 */
    felem_mul(tmp, alpha, beta);
    /*-
     * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
     *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
     *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
     *        < 2^128
     */
    felem_square(tmp2, gamma);
    /*-
     * tmp2[i] < 17*(2^59 + 2^14)^2
     *         = 17*(2^118 + 2^74 + 2^28)
     */
    felem_scalar128(tmp2, 8);
    /*-
     * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
     *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
     *         < 2^126
     */
    felem_diff128(tmp, tmp2);
    /*-
     * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
     *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
     *          2^74 + 2^69 + 2^34 + 2^30
     *        < 2^128
     */
    felem_reduce(y_out, tmp);
}
1131
1132 /* copy_conditional copies in to out iff mask is all ones. */
1133 static void copy_conditional(felem out, const felem in, limb mask)
1134 {
1135 unsigned i;
1136 for (i = 0; i < NLIMBS; ++i) {
1137 const limb tmp = mask & (in[i] ^ out[i]);
1138 out[i] ^= tmp;
1139 }
1140 }
1141
1142 /*-
1143 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1144 *
1145 * The method is taken from
1146 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1147 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1148 *
1149 * This function includes a branch for checking whether the two input points
1150 * are equal (while not equal to the point at infinity). See comment below
1151 * on constant-time.
1152 */
static void point_add(felem x3, felem y3, felem z3,
                      const felem x1, const felem y1, const felem z1,
                      const int mixed, const felem x2, const felem y2,
                      const felem z2)
{
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
    largefelem tmp, tmp2;
    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
    limb points_equal;

    /* all-ones masks if the corresponding input z coordinate is zero */
    z1_is_zero = felem_is_zero(z1);
    z2_is_zero = felem_is_zero(z2);

    /* ftmp = z1z1 = z1**2 */
    felem_square(tmp, z1);
    felem_reduce(ftmp, tmp);

    if (!mixed) {
        /* ftmp2 = z2z2 = z2**2 */
        felem_square(tmp, z2);
        felem_reduce(ftmp2, tmp);

        /* u1 = ftmp3 = x1*z2z2 */
        felem_mul(tmp, x1, ftmp2);
        felem_reduce(ftmp3, tmp);

        /* ftmp5 = z1 + z2 */
        felem_assign(ftmp5, z1);
        felem_sum64(ftmp5, z2);
        /* ftmp5[i] < 2^61 */

        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
        felem_square(tmp, ftmp5);
        /* tmp[i] < 17*2^122 */
        felem_diff_128_64(tmp, ftmp);
        /* tmp[i] < 17*2^122 + 2^63 */
        felem_diff_128_64(tmp, ftmp2);
        /* tmp[i] < 17*2^122 + 2^64 */
        felem_reduce(ftmp5, tmp);

        /* ftmp2 = z2 * z2z2 */
        felem_mul(tmp, ftmp2, z2);
        felem_reduce(ftmp2, tmp);

        /* s1 = ftmp6 = y1 * z2**3 */
        felem_mul(tmp, y1, ftmp2);
        felem_reduce(ftmp6, tmp);
    } else {
        /*
         * We'll assume z2 = 1 (special case z2 = 0 is handled later), so
         * the z2-dependent products above collapse to plain assignments.
         */

        /* u1 = ftmp3 = x1*z2z2 */
        felem_assign(ftmp3, x1);

        /* ftmp5 = 2*z1z2 */
        felem_scalar(ftmp5, z1, 2);

        /* s1 = ftmp6 = y1 * z2**3 */
        felem_assign(ftmp6, y1);
    }

    /* u2 = x2*z1z1 */
    felem_mul(tmp, x2, ftmp);
    /* tmp[i] < 17*2^120 */

    /* h = ftmp4 = u2 - u1 */
    felem_diff_128_64(tmp, ftmp3);
    /* tmp[i] < 17*2^120 + 2^63 */
    felem_reduce(ftmp4, tmp);

    x_equal = felem_is_zero(ftmp4);

    /* z_out = ftmp5 * h */
    felem_mul(tmp, ftmp5, ftmp4);
    felem_reduce(z_out, tmp);

    /* ftmp = z1 * z1z1 */
    felem_mul(tmp, ftmp, z1);
    felem_reduce(ftmp, tmp);

    /* s2 = tmp = y2 * z1**3 */
    felem_mul(tmp, y2, ftmp);
    /* tmp[i] < 17*2^120 */

    /* r = ftmp5 = (s2 - s1)*2 */
    felem_diff_128_64(tmp, ftmp6);
    /* tmp[i] < 17*2^120 + 2^63 */
    felem_reduce(ftmp5, tmp);
    y_equal = felem_is_zero(ftmp5);
    felem_scalar64(ftmp5, 2);
    /* ftmp5[i] < 2^61 */

    /*
     * The formulae are incorrect if the points are equal, in affine coordinates
     * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
     * happens.
     *
     * We use bitwise operations to avoid potential side-channels introduced by
     * the short-circuiting behaviour of boolean operators.
     *
     * The special case of either point being the point at infinity (z1 and/or
     * z2 are zero), is handled separately later on in this function, so we
     * avoid jumping to point_double here in those special cases.
     *
     * Notice the comment below on the implications of this branching for timing
     * leaks and why it is considered practically irrelevant.
     */
    points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));

    if (points_equal) {
        /*
         * This is obviously not constant-time but it will almost-never happen
         * for ECDH / ECDSA. The case where it can happen is during scalar-mult
         * where the intermediate value gets very close to the group order.
         * Since |ec_GFp_nistp_recode_scalar_bits| produces signed digits for
         * the scalar, it's possible for the intermediate value to be a small
         * negative multiple of the base point, and for the final signed digit
         * to be the same value. We believe that this only occurs for the scalar
         * 1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
         * ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
         * 71e913863f7, in that case the penultimate intermediate is -9G and
         * the final digit is also -9G. Since this only happens for a single
         * scalar, the timing leak is irrelevant. (Any attacker who wanted to
         * check whether a secret scalar was that exact value, can already do
         * so.)
         */
        point_double(x3, y3, z3, x1, y1, z1);
        return;
    }

    /* I = ftmp = (2h)**2 */
    felem_assign(ftmp, ftmp4);
    felem_scalar64(ftmp, 2);
    /* ftmp[i] < 2^61 */
    felem_square(tmp, ftmp);
    /* tmp[i] < 17*2^122 */
    felem_reduce(ftmp, tmp);

    /* J = ftmp2 = h * I */
    felem_mul(tmp, ftmp4, ftmp);
    felem_reduce(ftmp2, tmp);

    /* V = ftmp4 = U1 * I */
    felem_mul(tmp, ftmp3, ftmp);
    felem_reduce(ftmp4, tmp);

    /* x_out = r**2 - J - 2V */
    felem_square(tmp, ftmp5);
    /* tmp[i] < 17*2^122 */
    felem_diff_128_64(tmp, ftmp2);
    /* tmp[i] < 17*2^122 + 2^63 */
    felem_assign(ftmp3, ftmp4);
    felem_scalar64(ftmp4, 2);
    /* ftmp4[i] < 2^61 */
    felem_diff_128_64(tmp, ftmp4);
    /* tmp[i] < 17*2^122 + 2^64 */
    felem_reduce(x_out, tmp);

    /* y_out = r(V-x_out) - 2 * s1 * J */
    felem_diff64(ftmp3, x_out);
    /*
     * ftmp3[i] < 2^60 + 2^60 = 2^61
     */
    felem_mul(tmp, ftmp5, ftmp3);
    /* tmp[i] < 17*2^122 */
    felem_mul(tmp2, ftmp6, ftmp2);
    /* tmp2[i] < 17*2^120 */
    felem_scalar128(tmp2, 2);
    /* tmp2[i] < 17*2^121 */
    felem_diff128(tmp, tmp2);
    /*-
     * tmp[i] < 2^127 - 2^69 + 17*2^122
     *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
     *        < 2^127
     */
    felem_reduce(y_out, tmp);

    /*
     * Handle the point-at-infinity inputs in constant time: if z1 == 0 the
     * result is (x2, y2, z2); if z2 == 0 the result is (x1, y1, z1).
     */
    copy_conditional(x_out, x2, z1_is_zero);
    copy_conditional(x_out, x1, z2_is_zero);
    copy_conditional(y_out, y2, z1_is_zero);
    copy_conditional(y_out, y1, z2_is_zero);
    copy_conditional(z_out, z2, z1_is_zero);
    copy_conditional(z_out, z1, z2_is_zero);
    felem_assign(x3, x_out);
    felem_assign(y3, y_out);
    felem_assign(z3, z_out);
}
1341
1342 /*-
1343 * Base point pre computation
1344 * --------------------------
1345 *
1346 * Two different sorts of precomputed tables are used in the following code.
1347 * Each contain various points on the curve, where each point is three field
1348 * elements (x, y, z).
1349 *
1350 * For the base point table, z is usually 1 (0 for the point at infinity).
1351 * This table has 16 elements:
1352 * index | bits | point
1353 * ------+---------+------------------------------
1354 * 0 | 0 0 0 0 | 0G
1355 * 1 | 0 0 0 1 | 1G
1356 * 2 | 0 0 1 0 | 2^130G
1357 * 3 | 0 0 1 1 | (2^130 + 1)G
1358 * 4 | 0 1 0 0 | 2^260G
1359 * 5 | 0 1 0 1 | (2^260 + 1)G
1360 * 6 | 0 1 1 0 | (2^260 + 2^130)G
1361 * 7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1362 * 8 | 1 0 0 0 | 2^390G
1363 * 9 | 1 0 0 1 | (2^390 + 1)G
1364 * 10 | 1 0 1 0 | (2^390 + 2^130)G
1365 * 11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1366 * 12 | 1 1 0 0 | (2^390 + 2^260)G
1367 * 13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1368 * 14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1369 * 15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1370 *
1371 * The reason for this is so that we can clock bits into four different
1372 * locations when doing simple scalar multiplies against the base point.
1373 *
1374 * Tables for other points have table[i] = iG for i in 0 .. 16. */
1375
1376 /* gmul is the table of precomputed base points */
static const felem gmul[16][3] = {
    /* 0: the point at infinity (all-zero z) */
    {{0, 0, 0, 0, 0, 0, 0, 0, 0},
     {0, 0, 0, 0, 0, 0, 0, 0, 0},
     {0, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 1: 1G */
    {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
      0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
      0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
     {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
      0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
      0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 2: 2^130G */
    {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
      0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
      0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
     {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
      0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
      0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 3: (2^130 + 1)G */
    {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
      0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
      0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
     {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
      0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
      0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 4: 2^260G */
    {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
      0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
      0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
     {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
      0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
      0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 5: (2^260 + 1)G */
    {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
      0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
      0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
     {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
      0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
      0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 6: (2^260 + 2^130)G */
    {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
      0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
      0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
     {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
      0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
      0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 7: (2^260 + 2^130 + 1)G */
    {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
      0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
      0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
     {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
      0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
      0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 8: 2^390G */
    {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
      0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
      0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
     {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
      0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
      0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 9: (2^390 + 1)G */
    {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
      0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
      0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
     {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
      0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
      0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 10: (2^390 + 2^130)G */
    {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
      0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
      0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
     {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
      0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
      0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 11: (2^390 + 2^130 + 1)G */
    {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
      0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
      0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
     {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
      0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
      0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 12: (2^390 + 2^260)G */
    {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
      0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
      0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
     {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
      0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
      0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 13: (2^390 + 2^260 + 1)G */
    {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
      0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
      0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
     {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
      0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
      0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 14: (2^390 + 2^260 + 2^130)G */
    {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
      0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
      0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
     {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
      0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
      0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 15: (2^390 + 2^260 + 2^130 + 1)G */
    {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
      0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
      0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
     {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
      0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
      0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}}
};
1487
1488 /*
1489 * select_point selects the |idx|th point from a precomputation table and
1490 * copies it to out.
1491 */
1492 /* pre_comp below is of the size provided in |size| */
1493 static void select_point(const limb idx, unsigned int size,
1494 const felem pre_comp[][3], felem out[3])
1495 {
1496 unsigned i, j;
1497 limb *outlimbs = &out[0][0];
1498
1499 memset(out, 0, sizeof(*out) * 3);
1500
1501 for (i = 0; i < size; i++) {
1502 const limb *inlimbs = &pre_comp[i][0][0];
1503 limb mask = i ^ idx;
1504 mask |= mask >> 4;
1505 mask |= mask >> 2;
1506 mask |= mask >> 1;
1507 mask &= 1;
1508 mask--;
1509 for (j = 0; j < NLIMBS * 3; j++)
1510 outlimbs[j] |= inlimbs[j] & mask;
1511 }
1512 }
1513
1514 /* get_bit returns the |i|th bit in |in| */
1515 static char get_bit(const felem_bytearray in, int i)
1516 {
1517 if (i < 0)
1518 return 0;
1519 return (in[i >> 3] >> (i & 7)) & 1;
1520 }
1521
1522 /*
1523 * Interleaved point multiplication using precomputed point multiples: The
1524 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1525 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1526 * generator, using certain (large) precomputed multiples in g_pre_comp.
1527 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1528 */
static void batch_mul(felem x_out, felem y_out, felem z_out,
                      const felem_bytearray scalars[],
                      const unsigned num_points, const u8 *g_scalar,
                      const int mixed, const felem pre_comp[][17][3],
                      const felem g_pre_comp[16][3])
{
    int i, skip;
    unsigned num, gen_mul = (g_scalar != NULL);
    felem nq[3], tmp[4];        /* tmp[3] holds the negated y coordinate */
    limb bits;
    u8 sign, digit;

    /* set nq to the point at infinity */
    memset(nq, 0, sizeof(nq));

    /*
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
     * of the generator (last quarter of rounds) and additions of other
     * points multiples (every 5th round).
     */
    skip = 1;                   /* save two point operations in the first
                                 * round */
    for (i = (num_points ? 520 : 130); i >= 0; --i) {
        /* double */
        if (!skip)
            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

        /* add multiples of the generator */
        if (gen_mul && (i <= 130)) {
            /* one bit from each 130-bit quarter of g_scalar; see gmul[] */
            bits = get_bit(g_scalar, i + 390) << 3;
            if (i < 130) {
                bits |= get_bit(g_scalar, i + 260) << 2;
                bits |= get_bit(g_scalar, i + 130) << 1;
                bits |= get_bit(g_scalar, i);
            }
            /* select the point to add, in constant time */
            select_point(bits, 16, g_pre_comp, tmp);
            if (!skip) {
                /* The 1 argument below is for "mixed" */
                point_add(nq[0], nq[1], nq[2],
                          nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
            } else {
                /* first addition: just load the selected point into nq */
                memcpy(nq, tmp, 3 * sizeof(felem));
                skip = 0;
            }
        }

        /* do other additions every 5 doublings */
        if (num_points && (i % 5 == 0)) {
            /* loop over all scalars */
            for (num = 0; num < num_points; ++num) {
                /* gather a 6-bit window for signed-digit recoding */
                bits = get_bit(scalars[num], i + 4) << 5;
                bits |= get_bit(scalars[num], i + 3) << 4;
                bits |= get_bit(scalars[num], i + 2) << 3;
                bits |= get_bit(scalars[num], i + 1) << 2;
                bits |= get_bit(scalars[num], i) << 1;
                bits |= get_bit(scalars[num], i - 1);
                ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

                /*
                 * select the point to add or subtract, in constant time
                 */
                select_point(digit, 17, pre_comp[num], tmp);
                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
                                            * point */
                copy_conditional(tmp[1], tmp[3], (-(limb) sign));

                if (!skip) {
                    point_add(nq[0], nq[1], nq[2],
                              nq[0], nq[1], nq[2],
                              mixed, tmp[0], tmp[1], tmp[2]);
                } else {
                    /* first addition: just load the selected point into nq */
                    memcpy(nq, tmp, 3 * sizeof(felem));
                    skip = 0;
                }
            }
        }
    }
    felem_assign(x_out, nq[0]);
    felem_assign(y_out, nq[1]);
    felem_assign(z_out, nq[2]);
}
1611
1612 /* Precomputation for the group generator. */
struct nistp521_pre_comp_st {
    felem g_pre_comp[16][3];     /* precomputed generator multiples, same
                                  * layout/indexing as the static gmul[] */
    CRYPTO_REF_COUNT references; /* reference count; see
                                  * EC_nistp521_pre_comp_dup/free */
    CRYPTO_RWLOCK *lock;         /* lock backing the refcount operations */
};
1618
const EC_METHOD *EC_GFp_nistp521_method(void)
{
    /*
     * Method table for P-521: nistp521-specific implementations for the
     * performance-critical operations (set_curve, get_affine_coordinates,
     * points_mul, precompute_mult), with generic ec_GFp_simple_* /
     * ec_GFp_nist_* fallbacks for everything else. Entry order is fixed by
     * the EC_METHOD struct layout.
     */
    static const EC_METHOD ret = {
        EC_FLAGS_DEFAULT_OCT,
        NID_X9_62_prime_field,
        ec_GFp_nistp521_group_init,
        ec_GFp_simple_group_finish,
        ec_GFp_simple_group_clear_finish,
        ec_GFp_nist_group_copy,
        ec_GFp_nistp521_group_set_curve,
        ec_GFp_simple_group_get_curve,
        ec_GFp_simple_group_get_degree,
        ec_group_simple_order_bits,
        ec_GFp_simple_group_check_discriminant,
        ec_GFp_simple_point_init,
        ec_GFp_simple_point_finish,
        ec_GFp_simple_point_clear_finish,
        ec_GFp_simple_point_copy,
        ec_GFp_simple_point_set_to_infinity,
        ec_GFp_simple_set_Jprojective_coordinates_GFp,
        ec_GFp_simple_get_Jprojective_coordinates_GFp,
        ec_GFp_simple_point_set_affine_coordinates,
        ec_GFp_nistp521_point_get_affine_coordinates,
        0 /* point_set_compressed_coordinates */ ,
        0 /* point2oct */ ,
        0 /* oct2point */ ,
        ec_GFp_simple_add,
        ec_GFp_simple_dbl,
        ec_GFp_simple_invert,
        ec_GFp_simple_is_at_infinity,
        ec_GFp_simple_is_on_curve,
        ec_GFp_simple_cmp,
        ec_GFp_simple_make_affine,
        ec_GFp_simple_points_make_affine,
        ec_GFp_nistp521_points_mul,
        ec_GFp_nistp521_precompute_mult,
        ec_GFp_nistp521_have_precompute_mult,
        ec_GFp_nist_field_mul,
        ec_GFp_nist_field_sqr,
        0 /* field_div */ ,
        ec_GFp_simple_field_inv,
        0 /* field_encode */ ,
        0 /* field_decode */ ,
        0,                      /* field_set_to_one */
        ec_key_simple_priv2oct,
        ec_key_simple_oct2priv,
        0,                      /* set private */
        ec_key_simple_generate_key,
        ec_key_simple_check_key,
        ec_key_simple_generate_public_key,
        0,                      /* keycopy */
        0,                      /* keyfinish */
        ecdh_simple_compute_key,
        0,                      /* field_inverse_mod_ord */
        0,                      /* blind_coordinates */
        0,                      /* ladder_pre */
        0,                      /* ladder_step */
        0                       /* ladder_post */
    };

    return &ret;
}
1681
1682 /******************************************************************************/
1683 /*
1684 * FUNCTIONS TO MANAGE PRECOMPUTATION
1685 */
1686
1687 static NISTP521_PRE_COMP *nistp521_pre_comp_new(void)
1688 {
1689 NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1690
1691 if (ret == NULL) {
1692 ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1693 return ret;
1694 }
1695
1696 ret->references = 1;
1697
1698 ret->lock = CRYPTO_THREAD_lock_new();
1699 if (ret->lock == NULL) {
1700 ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1701 OPENSSL_free(ret);
1702 return NULL;
1703 }
1704 return ret;
1705 }
1706
1707 NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1708 {
1709 int i;
1710 if (p != NULL)
1711 CRYPTO_UP_REF(&p->references, &i, p->lock);
1712 return p;
1713 }
1714
1715 void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1716 {
1717 int i;
1718
1719 if (p == NULL)
1720 return;
1721
1722 CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1723 REF_PRINT_COUNT("EC_nistp521", x);
1724 if (i > 0)
1725 return;
1726 REF_ASSERT_ISNT(i < 0);
1727
1728 CRYPTO_THREAD_lock_free(p->lock);
1729 OPENSSL_free(p);
1730 }
1731
1732 /******************************************************************************/
1733 /*
1734 * OPENSSL EC_METHOD FUNCTIONS
1735 */
1736
1737 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1738 {
1739 int ret;
1740 ret = ec_GFp_simple_group_init(group);
1741 group->a_is_minus3 = 1;
1742 return ret;
1743 }
1744
1745 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1746 const BIGNUM *a, const BIGNUM *b,
1747 BN_CTX *ctx)
1748 {
1749 int ret = 0;
1750 BN_CTX *new_ctx = NULL;
1751 BIGNUM *curve_p, *curve_a, *curve_b;
1752
1753 if (ctx == NULL)
1754 if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1755 return 0;
1756 BN_CTX_start(ctx);
1757 curve_p = BN_CTX_get(ctx);
1758 curve_a = BN_CTX_get(ctx);
1759 curve_b = BN_CTX_get(ctx);
1760 if (curve_b == NULL)
1761 goto err;
1762 BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1763 BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1764 BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1765 if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1766 ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1767 EC_R_WRONG_CURVE_PARAMETERS);
1768 goto err;
1769 }
1770 group->field_mod_func = BN_nist_mod_521;
1771 ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1772 err:
1773 BN_CTX_end(ctx);
1774 BN_CTX_free(new_ctx);
1775 return ret;
1776 }
1777
1778 /*
1779 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1780 * (X/Z^2, Y/Z^3)
1781 */
int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
                                                 const EC_POINT *point,
                                                 BIGNUM *x, BIGNUM *y,
                                                 BN_CTX *ctx)
{
    felem z1, z2, x_in, y_in, x_out, y_out;
    largefelem tmp;

    /* the point at infinity has no affine representation */
    if (EC_POINT_is_at_infinity(group, point)) {
        ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
              EC_R_POINT_AT_INFINITY);
        return 0;
    }
    if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
        (!BN_to_felem(z1, point->Z)))
        return 0;
    felem_inv(z2, z1);          /* z2 = Z^-1 */
    felem_square(tmp, z2);
    felem_reduce(z1, tmp);      /* z1 reused: now Z^-2 */
    felem_mul(tmp, x_in, z1);
    felem_reduce(x_in, tmp);    /* x_in = X / Z^2 */
    felem_contract(x_out, x_in);
    if (x != NULL) {
        if (!felem_to_BN(x, x_out)) {
            ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
                  ERR_R_BN_LIB);
            return 0;
        }
    }
    felem_mul(tmp, z1, z2);
    felem_reduce(z1, tmp);      /* z1 = Z^-3 */
    felem_mul(tmp, y_in, z1);
    felem_reduce(y_in, tmp);    /* y_in = Y / Z^3 */
    felem_contract(y_out, y_in);
    if (y != NULL) {
        if (!felem_to_BN(y, y_out)) {
            ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
                  ERR_R_BN_LIB);
            return 0;
        }
    }
    return 1;
}
1825
/* |points| below is of size |num|, and |tmp_felems| is of size |num+1| */
1827 static void make_points_affine(size_t num, felem points[][3],
1828 felem tmp_felems[])
1829 {
1830 /*
1831 * Runs in constant time, unless an input is the point at infinity (which
1832 * normally shouldn't happen).
1833 */
1834 ec_GFp_nistp_points_make_affine_internal(num,
1835 points,
1836 sizeof(felem),
1837 tmp_felems,
1838 (void (*)(void *))felem_one,
1839 felem_is_zero_int,
1840 (void (*)(void *, const void *))
1841 felem_assign,
1842 (void (*)(void *, const void *))
1843 felem_square_reduce, (void (*)
1844 (void *,
1845 const void
1846 *,
1847 const void
1848 *))
1849 felem_mul_reduce,
1850 (void (*)(void *, const void *))
1851 felem_inv,
1852 (void (*)(void *, const void *))
1853 felem_contract);
1854 }
1855
1856 /*
1857 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1858 * values Result is stored in r (r can equal one of the inputs).
1859 */
1860 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1861 const BIGNUM *scalar, size_t num,
1862 const EC_POINT *points[],
1863 const BIGNUM *scalars[], BN_CTX *ctx)
1864 {
1865 int ret = 0;
1866 int j;
1867 int mixed = 0;
1868 BIGNUM *x, *y, *z, *tmp_scalar;
1869 felem_bytearray g_secret;
1870 felem_bytearray *secrets = NULL;
1871 felem (*pre_comp)[17][3] = NULL;
1872 felem *tmp_felems = NULL;
1873 unsigned i;
1874 int num_bytes;
1875 int have_pre_comp = 0;
1876 size_t num_points = num;
1877 felem x_in, y_in, z_in, x_out, y_out, z_out;
1878 NISTP521_PRE_COMP *pre = NULL;
1879 felem(*g_pre_comp)[3] = NULL;
1880 EC_POINT *generator = NULL;
1881 const EC_POINT *p = NULL;
1882 const BIGNUM *p_scalar = NULL;
1883
1884 BN_CTX_start(ctx);
1885 x = BN_CTX_get(ctx);
1886 y = BN_CTX_get(ctx);
1887 z = BN_CTX_get(ctx);
1888 tmp_scalar = BN_CTX_get(ctx);
1889 if (tmp_scalar == NULL)
1890 goto err;
1891
1892 if (scalar != NULL) {
1893 pre = group->pre_comp.nistp521;
1894 if (pre)
1895 /* we have precomputation, try to use it */
1896 g_pre_comp = &pre->g_pre_comp[0];
1897 else
1898 /* try to use the standard precomputation */
1899 g_pre_comp = (felem(*)[3]) gmul;
1900 generator = EC_POINT_new(group);
1901 if (generator == NULL)
1902 goto err;
1903 /* get the generator from precomputation */
1904 if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1905 !felem_to_BN(y, g_pre_comp[1][1]) ||
1906 !felem_to_BN(z, g_pre_comp[1][2])) {
1907 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1908 goto err;
1909 }
1910 if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1911 generator, x, y, z,
1912 ctx))
1913 goto err;
1914 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1915 /* precomputation matches generator */
1916 have_pre_comp = 1;
1917 else
1918 /*
1919 * we don't have valid precomputation: treat the generator as a
1920 * random point
1921 */
1922 num_points++;
1923 }
1924
1925 if (num_points > 0) {
1926 if (num_points >= 2) {
1927 /*
1928 * unless we precompute multiples for just one point, converting
1929 * those into affine form is time well spent
1930 */
1931 mixed = 1;
1932 }
1933 secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1934 pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1935 if (mixed)
1936 tmp_felems =
1937 OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1938 if ((secrets == NULL) || (pre_comp == NULL)
1939 || (mixed && (tmp_felems == NULL))) {
1940 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1941 goto err;
1942 }
1943
1944 /*
1945 * we treat NULL scalars as 0, and NULL points as points at infinity,
1946 * i.e., they contribute nothing to the linear combination
1947 */
1948 for (i = 0; i < num_points; ++i) {
1949 if (i == num) {
1950 /*
1951 * we didn't have a valid precomputation, so we pick the
1952 * generator
1953 */
1954 p = EC_GROUP_get0_generator(group);
1955 p_scalar = scalar;
1956 } else {
1957 /* the i^th point */
1958 p = points[i];
1959 p_scalar = scalars[i];
1960 }
1961 if ((p_scalar != NULL) && (p != NULL)) {
1962 /* reduce scalar to 0 <= scalar < 2^521 */
1963 if ((BN_num_bits(p_scalar) > 521)
1964 || (BN_is_negative(p_scalar))) {
1965 /*
1966 * this is an unusual input, and we don't guarantee
1967 * constant-timeness
1968 */
1969 if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1970 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1971 goto err;
1972 }
1973 num_bytes = BN_bn2lebinpad(tmp_scalar,
1974 secrets[i], sizeof(secrets[i]));
1975 } else {
1976 num_bytes = BN_bn2lebinpad(p_scalar,
1977 secrets[i], sizeof(secrets[i]));
1978 }
1979 if (num_bytes < 0) {
1980 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1981 goto err;
1982 }
1983 /* precompute multiples */
1984 if ((!BN_to_felem(x_out, p->X)) ||
1985 (!BN_to_felem(y_out, p->Y)) ||
1986 (!BN_to_felem(z_out, p->Z)))
1987 goto err;
1988 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1989 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1990 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1991 for (j = 2; j <= 16; ++j) {
1992 if (j & 1) {
1993 point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1994 pre_comp[i][j][2], pre_comp[i][1][0],
1995 pre_comp[i][1][1], pre_comp[i][1][2], 0,
1996 pre_comp[i][j - 1][0],
1997 pre_comp[i][j - 1][1],
1998 pre_comp[i][j - 1][2]);
1999 } else {
2000 point_double(pre_comp[i][j][0], pre_comp[i][j][1],
2001 pre_comp[i][j][2], pre_comp[i][j / 2][0],
2002 pre_comp[i][j / 2][1],
2003 pre_comp[i][j / 2][2]);
2004 }
2005 }
2006 }
2007 }
2008 if (mixed)
2009 make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
2010 }
2011
2012 /* the scalar for the generator */
2013 if ((scalar != NULL) && (have_pre_comp)) {
2014 memset(g_secret, 0, sizeof(g_secret));
2015 /* reduce scalar to 0 <= scalar < 2^521 */
2016 if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
2017 /*
2018 * this is an unusual input, and we don't guarantee
2019 * constant-timeness
2020 */
2021 if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2022 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2023 goto err;
2024 }
2025 num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2026 } else {
2027 num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2028 }
2029 /* do the multiplication with generator precomputation */
2030 batch_mul(x_out, y_out, z_out,
2031 (const felem_bytearray(*))secrets, num_points,
2032 g_secret,
2033 mixed, (const felem(*)[17][3])pre_comp,
2034 (const felem(*)[3])g_pre_comp);
2035 } else {
2036 /* do the multiplication without generator precomputation */
2037 batch_mul(x_out, y_out, z_out,
2038 (const felem_bytearray(*))secrets, num_points,
2039 NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
2040 }
2041 /* reduce the output to its unique minimal representation */
2042 felem_contract(x_in, x_out);
2043 felem_contract(y_in, y_out);
2044 felem_contract(z_in, z_out);
2045 if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2046 (!felem_to_BN(z, z_in))) {
2047 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2048 goto err;
2049 }
2050 ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2051
2052 err:
2053 BN_CTX_end(ctx);
2054 EC_POINT_free(generator);
2055 OPENSSL_free(secrets);
2056 OPENSSL_free(pre_comp);
2057 OPENSSL_free(tmp_felems);
2058 return ret;
2059 }
2060
2061 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2062 {
2063 int ret = 0;
2064 NISTP521_PRE_COMP *pre = NULL;
2065 int i, j;
2066 BN_CTX *new_ctx = NULL;
2067 BIGNUM *x, *y;
2068 EC_POINT *generator = NULL;
2069 felem tmp_felems[16];
2070
2071 /* throw away old precomputation */
2072 EC_pre_comp_free(group);
2073 if (ctx == NULL)
2074 if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2075 return 0;
2076 BN_CTX_start(ctx);
2077 x = BN_CTX_get(ctx);
2078 y = BN_CTX_get(ctx);
2079 if (y == NULL)
2080 goto err;
2081 /* get the generator */
2082 if (group->generator == NULL)
2083 goto err;
2084 generator = EC_POINT_new(group);
2085 if (generator == NULL)
2086 goto err;
2087 BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2088 BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2089 if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2090 goto err;
2091 if ((pre = nistp521_pre_comp_new()) == NULL)
2092 goto err;
2093 /*
2094 * if the generator is the standard one, use built-in precomputation
2095 */
2096 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2097 memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2098 goto done;
2099 }
2100 if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
2101 (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
2102 (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2103 goto err;
2104 /* compute 2^130*G, 2^260*G, 2^390*G */
2105 for (i = 1; i <= 4; i <<= 1) {
2106 point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2107 pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2108 pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2109 for (j = 0; j < 129; ++j) {
2110 point_double(pre->g_pre_comp[2 * i][0],
2111 pre->g_pre_comp[2 * i][1],
2112 pre->g_pre_comp[2 * i][2],
2113 pre->g_pre_comp[2 * i][0],
2114 pre->g_pre_comp[2 * i][1],
2115 pre->g_pre_comp[2 * i][2]);
2116 }
2117 }
2118 /* g_pre_comp[0] is the point at infinity */
2119 memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2120 /* the remaining multiples */
2121 /* 2^130*G + 2^260*G */
2122 point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2123 pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2124 pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2125 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2126 pre->g_pre_comp[2][2]);
2127 /* 2^130*G + 2^390*G */
2128 point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2129 pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2130 pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2131 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2132 pre->g_pre_comp[2][2]);
2133 /* 2^260*G + 2^390*G */
2134 point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2135 pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2136 pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2137 0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2138 pre->g_pre_comp[4][2]);
2139 /* 2^130*G + 2^260*G + 2^390*G */
2140 point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2141 pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2142 pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2143 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2144 pre->g_pre_comp[2][2]);
2145 for (i = 1; i < 8; ++i) {
2146 /* odd multiples: add G */
2147 point_add(pre->g_pre_comp[2 * i + 1][0],
2148 pre->g_pre_comp[2 * i + 1][1],
2149 pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2150 pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2151 pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2152 pre->g_pre_comp[1][2]);
2153 }
2154 make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2155
2156 done:
2157 SETPRECOMP(group, nistp521, pre);
2158 ret = 1;
2159 pre = NULL;
2160 err:
2161 BN_CTX_end(ctx);
2162 EC_POINT_free(generator);
2163 BN_CTX_free(new_ctx);
2164 EC_nistp521_pre_comp_free(pre);
2165 return ret;
2166 }
2167
2168 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2169 {
2170 return HAVEPRECOMP(group, nistp521);
2171 }
2172
2173 #endif