crypto/ec/ecp_nistp521.c

   1 /*
   2  * Copyright 2011-2021 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the Apache License 2.0 (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * ECDSA low level APIs are deprecated for public use, but still ok for
  28  * internal use.
  29  */
  30 #include "internal/deprecated.h"
  31
  32 /*
  33  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  34  *
  35  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  36  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  37  * work which got its smarts from Daniel J. Bernstein's work on the same.
  38  */
  39
  40 #include <openssl/e_os2.h>
  41
  42 #include <string.h>
  43 #include <openssl/err.h>
  44 #include "ec_local.h"
  45
  46 #include "internal/numbers.h"
  47
  48 #ifndef INT128_MAX
  49 # error "Your compiler doesn't appear to support 128-bit integer types"
  50 #endif
  51
  52 typedef uint8_t u8;
  53 typedef uint64_t u64;
  54
  55 /*
  56  * The underlying field. P521 operates over GF(2^521-1). We can serialize an
  57  * element of this field into 66 bytes where the most significant byte
  58  * contains only a single bit. We call this an felem_bytearray.
  59  */
  60
  61 typedef u8 felem_bytearray[66];
  62
  63 /*
  64  * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
  65  * These values are big-endian.
  66  */
  67 static const felem_bytearray nistp521_curve_params[5] = {
  68     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
  69      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  70      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  72      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  73      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  74      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  75      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  76      0xff, 0xff},
  77     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
  78      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  79      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  80      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  81      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  82      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  83      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  84      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  85      0xff, 0xfc},
  86     {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
  87      0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
  88      0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
  89      0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
  90      0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
  91      0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
  92      0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
  93      0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
  94      0x3f, 0x00},
  95     {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
  96      0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
  97      0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
  98      0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
  99      0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
 100      0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
 101      0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
 102      0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
 103      0xbd, 0x66},
 104     {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
 105      0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
 106      0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
 107      0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
 108      0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
 109      0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
 110      0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
 111      0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
 112      0x66, 0x50}
 113 };
 114
 115 /*-
 116  * The representation of field elements.
 117  * ------------------------------------
 118  *
 119  * We represent field elements with nine values. These values are either 64 or
 120  * 128 bits and the field element represented is:
 121  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 122  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 123  * 58 bits apart, but are greater than 58 bits in length, the most significant
 124  * bits of each limb overlap with the least significant bits of the next.
 125  *
 126  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 127  * 'largefelem' */
 128
 129 #define NLIMBS 9
 130
 131 typedef uint64_t limb;
 132 typedef limb limb_aX __attribute((__aligned__(1)));
 133 typedef limb felem[NLIMBS];
 134 typedef uint128_t largefelem[NLIMBS];
 135
 136 static const limb bottom57bits = 0x1ffffffffffffff;
 137 static const limb bottom58bits = 0x3ffffffffffffff;
 138
 139 /*
 140  * bin66_to_felem takes a little-endian byte array and converts it into felem
 141  * form. This assumes that the CPU is little-endian.
 142  */
 143 static void bin66_to_felem(felem out, const u8 in[66])
 144 {
 145     out[0] = (*((limb *) & in[0])) & bottom58bits;
 146     out[1] = (*((limb_aX *) & in[7]) >> 2) & bottom58bits;
 147     out[2] = (*((limb_aX *) & in[14]) >> 4) & bottom58bits;
 148     out[3] = (*((limb_aX *) & in[21]) >> 6) & bottom58bits;
 149     out[4] = (*((limb_aX *) & in[29])) & bottom58bits;
 150     out[5] = (*((limb_aX *) & in[36]) >> 2) & bottom58bits;
 151     out[6] = (*((limb_aX *) & in[43]) >> 4) & bottom58bits;
 152     out[7] = (*((limb_aX *) & in[50]) >> 6) & bottom58bits;
 153     out[8] = (*((limb_aX *) & in[58])) & bottom57bits;
 154 }
 155
 156 /*
 157  * felem_to_bin66 takes an felem and serializes into a little endian, 66 byte
 158  * array. This assumes that the CPU is little-endian.
 159  */
 160 static void felem_to_bin66(u8 out[66], const felem in)
 161 {
 162     memset(out, 0, 66);
 163     (*((limb *) & out[0])) = in[0];
 164     (*((limb_aX *) & out[7])) |= in[1] << 2;
 165     (*((limb_aX *) & out[14])) |= in[2] << 4;
 166     (*((limb_aX *) & out[21])) |= in[3] << 6;
 167     (*((limb_aX *) & out[29])) = in[4];
 168     (*((limb_aX *) & out[36])) |= in[5] << 2;
 169     (*((limb_aX *) & out[43])) |= in[6] << 4;
 170     (*((limb_aX *) & out[50])) |= in[7] << 6;
 171     (*((limb_aX *) & out[58])) = in[8];
 172 }
 173
 174 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 175 static int BN_to_felem(felem out, const BIGNUM *bn)
 176 {
 177     felem_bytearray b_out;
 178     int num_bytes;
 179
 180     if (BN_is_negative(bn)) {
 181         ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
 182         return 0;
 183     }
 184     num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
 185     if (num_bytes < 0) {
 186         ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
 187         return 0;
 188     }
 189     bin66_to_felem(out, b_out);
 190     return 1;
 191 }
 192
 193 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 194 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 195 {
 196     felem_bytearray b_out;
 197     felem_to_bin66(b_out, in);
 198     return BN_lebin2bn(b_out, sizeof(b_out), out);
 199 }
 200
 201 /*-
 202  * Field operations
 203  * ----------------
 204  */
 205
 206 static void felem_one(felem out)
 207 {
 208     out[0] = 1;
 209     out[1] = 0;
 210     out[2] = 0;
 211     out[3] = 0;
 212     out[4] = 0;
 213     out[5] = 0;
 214     out[6] = 0;
 215     out[7] = 0;
 216     out[8] = 0;
 217 }
 218
 219 static void felem_assign(felem out, const felem in)
 220 {
 221     out[0] = in[0];
 222     out[1] = in[1];
 223     out[2] = in[2];
 224     out[3] = in[3];
 225     out[4] = in[4];
 226     out[5] = in[5];
 227     out[6] = in[6];
 228     out[7] = in[7];
 229     out[8] = in[8];
 230 }
 231
 232 /* felem_sum64 sets out = out + in. */
 233 static void felem_sum64(felem out, const felem in)
 234 {
 235     out[0] += in[0];
 236     out[1] += in[1];
 237     out[2] += in[2];
 238     out[3] += in[3];
 239     out[4] += in[4];
 240     out[5] += in[5];
 241     out[6] += in[6];
 242     out[7] += in[7];
 243     out[8] += in[8];
 244 }
 245
 246 /* felem_scalar sets out = in * scalar */
 247 static void felem_scalar(felem out, const felem in, limb scalar)
 248 {
 249     out[0] = in[0] * scalar;
 250     out[1] = in[1] * scalar;
 251     out[2] = in[2] * scalar;
 252     out[3] = in[3] * scalar;
 253     out[4] = in[4] * scalar;
 254     out[5] = in[5] * scalar;
 255     out[6] = in[6] * scalar;
 256     out[7] = in[7] * scalar;
 257     out[8] = in[8] * scalar;
 258 }
 259
 260 /* felem_scalar64 sets out = out * scalar */
 261 static void felem_scalar64(felem out, limb scalar)
 262 {
 263     out[0] *= scalar;
 264     out[1] *= scalar;
 265     out[2] *= scalar;
 266     out[3] *= scalar;
 267     out[4] *= scalar;
 268     out[5] *= scalar;
 269     out[6] *= scalar;
 270     out[7] *= scalar;
 271     out[8] *= scalar;
 272 }
 273
 274 /* felem_scalar128 sets out = out * scalar */
 275 static void felem_scalar128(largefelem out, limb scalar)
 276 {
 277     out[0] *= scalar;
 278     out[1] *= scalar;
 279     out[2] *= scalar;
 280     out[3] *= scalar;
 281     out[4] *= scalar;
 282     out[5] *= scalar;
 283     out[6] *= scalar;
 284     out[7] *= scalar;
 285     out[8] *= scalar;
 286 }
 287
 288 /*-
 289  * felem_neg sets |out| to |-in|
 290  * On entry:
 291  *   in[i] < 2^59 + 2^14
 292  * On exit:
 293  *   out[i] < 2^62
 294  */
 295 static void felem_neg(felem out, const felem in)
 296 {
 297     /* In order to prevent underflow, we subtract from 0 mod p. */
 298     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 299     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 300
 301     out[0] = two62m3 - in[0];
 302     out[1] = two62m2 - in[1];
 303     out[2] = two62m2 - in[2];
 304     out[3] = two62m2 - in[3];
 305     out[4] = two62m2 - in[4];
 306     out[5] = two62m2 - in[5];
 307     out[6] = two62m2 - in[6];
 308     out[7] = two62m2 - in[7];
 309     out[8] = two62m2 - in[8];
 310 }
 311
 312 /*-
 313  * felem_diff64 subtracts |in| from |out|
 314  * On entry:
 315  *   in[i] < 2^59 + 2^14
 316  * On exit:
 317  *   out[i] < out[i] + 2^62
 318  */
 319 static void felem_diff64(felem out, const felem in)
 320 {
 321     /*
 322      * In order to prevent underflow, we add 0 mod p before subtracting.
 323      */
 324     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 325     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 326
 327     out[0] += two62m3 - in[0];
 328     out[1] += two62m2 - in[1];
 329     out[2] += two62m2 - in[2];
 330     out[3] += two62m2 - in[3];
 331     out[4] += two62m2 - in[4];
 332     out[5] += two62m2 - in[5];
 333     out[6] += two62m2 - in[6];
 334     out[7] += two62m2 - in[7];
 335     out[8] += two62m2 - in[8];
 336 }
 337
 338 /*-
 339  * felem_diff_128_64 subtracts |in| from |out|
 340  * On entry:
 341  *   in[i] < 2^62 + 2^17
 342  * On exit:
 343  *   out[i] < out[i] + 2^63
 344  */
 345 static void felem_diff_128_64(largefelem out, const felem in)
 346 {
 347     /*
 348      * In order to prevent underflow, we add 64p mod p (which is equivalent
 349      * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
 350      * digit number with all bits set to 1. See "The representation of field
 351      * elements" comment above for a description of how limbs are used to
 352      * represent a number. 64p is represented with 8 limbs containing a number
 353      * with 58 bits set and one limb with a number with 57 bits set.
 354      */
 355     static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
 356     static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
 357
 358     out[0] += two63m6 - in[0];
 359     out[1] += two63m5 - in[1];
 360     out[2] += two63m5 - in[2];
 361     out[3] += two63m5 - in[3];
 362     out[4] += two63m5 - in[4];
 363     out[5] += two63m5 - in[5];
 364     out[6] += two63m5 - in[6];
 365     out[7] += two63m5 - in[7];
 366     out[8] += two63m5 - in[8];
 367 }
 368
 369 /*-
 370  * felem_diff_128_64 subtracts |in| from |out|
 371  * On entry:
 372  *   in[i] < 2^126
 373  * On exit:
 374  *   out[i] < out[i] + 2^127 - 2^69
 375  */
 376 static void felem_diff128(largefelem out, const largefelem in)
 377 {
 378     /*
 379      * In order to prevent underflow, we add 0 mod p before subtracting.
 380      */
 381     static const uint128_t two127m70 =
 382         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
 383     static const uint128_t two127m69 =
 384         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
 385
 386     out[0] += (two127m70 - in[0]);
 387     out[1] += (two127m69 - in[1]);
 388     out[2] += (two127m69 - in[2]);
 389     out[3] += (two127m69 - in[3]);
 390     out[4] += (two127m69 - in[4]);
 391     out[5] += (two127m69 - in[5]);
 392     out[6] += (two127m69 - in[6]);
 393     out[7] += (two127m69 - in[7]);
 394     out[8] += (two127m69 - in[8]);
 395 }
 396
 397 /*-
 398  * felem_square sets |out| = |in|^2
 399  * On entry:
 400  *   in[i] < 2^62
 401  * On exit:
 402  *   out[i] < 17 * max(in[i]) * max(in[i])
 403  */
 404 static void felem_square_ref(largefelem out, const felem in)
 405 {
 406     felem inx2, inx4;
 407     felem_scalar(inx2, in, 2);
 408     felem_scalar(inx4, in, 4);
 409
 410     /*-
 411      * We have many cases were we want to do
 412      *   in[x] * in[y] +
 413      *   in[y] * in[x]
 414      * This is obviously just
 415      *   2 * in[x] * in[y]
 416      * However, rather than do the doubling on the 128 bit result, we
 417      * double one of the inputs to the multiplication by reading from
 418      * |inx2|
 419      */
 420
 421     out[0] = ((uint128_t) in[0]) * in[0];
 422     out[1] = ((uint128_t) in[0]) * inx2[1];
 423     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
 424     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
 425     out[4] = ((uint128_t) in[0]) * inx2[4] +
 426              ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
 427     out[5] = ((uint128_t) in[0]) * inx2[5] +
 428              ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
 429     out[6] = ((uint128_t) in[0]) * inx2[6] +
 430              ((uint128_t) in[1]) * inx2[5] +
 431              ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
 432     out[7] = ((uint128_t) in[0]) * inx2[7] +
 433              ((uint128_t) in[1]) * inx2[6] +
 434              ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
 435     out[8] = ((uint128_t) in[0]) * inx2[8] +
 436              ((uint128_t) in[1]) * inx2[7] +
 437              ((uint128_t) in[2]) * inx2[6] +
 438              ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
 439
 440     /*
 441      * The remaining limbs fall above 2^521, with the first falling at 2^522.
 442      * They correspond to locations one bit up from the limbs produced above
 443      * so we would have to multiply by two to align them. Again, rather than
 444      * operate on the 128-bit result, we double one of the inputs to the
 445      * multiplication. If we want to double for both this reason, and the
 446      * reason above, then we end up multiplying by four.
 447      */
 448
 449     /* 9 */
 450     out[0] += ((uint128_t) in[1]) * inx4[8] +
 451               ((uint128_t) in[2]) * inx4[7] +
 452               ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
 453
 454     /* 10 */
 455     out[1] += ((uint128_t) in[2]) * inx4[8] +
 456               ((uint128_t) in[3]) * inx4[7] +
 457               ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
 458
 459     /* 11 */
 460     out[2] += ((uint128_t) in[3]) * inx4[8] +
 461               ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
 462
 463     /* 12 */
 464     out[3] += ((uint128_t) in[4]) * inx4[8] +
 465               ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
 466
 467     /* 13 */
 468     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
 469
 470     /* 14 */
 471     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
 472
 473     /* 15 */
 474     out[6] += ((uint128_t) in[7]) * inx4[8];
 475
 476     /* 16 */
 477     out[7] += ((uint128_t) in[8]) * inx2[8];
 478 }
 479
 480 /*-
 481  * felem_mul sets |out| = |in1| * |in2|
 482  * On entry:
 483  *   in1[i] < 2^64
 484  *   in2[i] < 2^63
 485  * On exit:
 486  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 487  */
 488 static void felem_mul_ref(largefelem out, const felem in1, const felem in2)
 489 {
 490     felem in2x2;
 491     felem_scalar(in2x2, in2, 2);
 492
 493     out[0] = ((uint128_t) in1[0]) * in2[0];
 494
 495     out[1] = ((uint128_t) in1[0]) * in2[1] +
 496              ((uint128_t) in1[1]) * in2[0];
 497
 498     out[2] = ((uint128_t) in1[0]) * in2[2] +
 499              ((uint128_t) in1[1]) * in2[1] +
 500              ((uint128_t) in1[2]) * in2[0];
 501
 502     out[3] = ((uint128_t) in1[0]) * in2[3] +
 503              ((uint128_t) in1[1]) * in2[2] +
 504              ((uint128_t) in1[2]) * in2[1] +
 505              ((uint128_t) in1[3]) * in2[0];
 506
 507     out[4] = ((uint128_t) in1[0]) * in2[4] +
 508              ((uint128_t) in1[1]) * in2[3] +
 509              ((uint128_t) in1[2]) * in2[2] +
 510              ((uint128_t) in1[3]) * in2[1] +
 511              ((uint128_t) in1[4]) * in2[0];
 512
 513     out[5] = ((uint128_t) in1[0]) * in2[5] +
 514              ((uint128_t) in1[1]) * in2[4] +
 515              ((uint128_t) in1[2]) * in2[3] +
 516              ((uint128_t) in1[3]) * in2[2] +
 517              ((uint128_t) in1[4]) * in2[1] +
 518              ((uint128_t) in1[5]) * in2[0];
 519
 520     out[6] = ((uint128_t) in1[0]) * in2[6] +
 521              ((uint128_t) in1[1]) * in2[5] +
 522              ((uint128_t) in1[2]) * in2[4] +
 523              ((uint128_t) in1[3]) * in2[3] +
 524              ((uint128_t) in1[4]) * in2[2] +
 525              ((uint128_t) in1[5]) * in2[1] +
 526              ((uint128_t) in1[6]) * in2[0];
 527
 528     out[7] = ((uint128_t) in1[0]) * in2[7] +
 529              ((uint128_t) in1[1]) * in2[6] +
 530              ((uint128_t) in1[2]) * in2[5] +
 531              ((uint128_t) in1[3]) * in2[4] +
 532              ((uint128_t) in1[4]) * in2[3] +
 533              ((uint128_t) in1[5]) * in2[2] +
 534              ((uint128_t) in1[6]) * in2[1] +
 535              ((uint128_t) in1[7]) * in2[0];
 536
 537     out[8] = ((uint128_t) in1[0]) * in2[8] +
 538              ((uint128_t) in1[1]) * in2[7] +
 539              ((uint128_t) in1[2]) * in2[6] +
 540              ((uint128_t) in1[3]) * in2[5] +
 541              ((uint128_t) in1[4]) * in2[4] +
 542              ((uint128_t) in1[5]) * in2[3] +
 543              ((uint128_t) in1[6]) * in2[2] +
 544              ((uint128_t) in1[7]) * in2[1] +
 545              ((uint128_t) in1[8]) * in2[0];
 546
 547     /* See comment in felem_square about the use of in2x2 here */
 548
 549     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 550               ((uint128_t) in1[2]) * in2x2[7] +
 551               ((uint128_t) in1[3]) * in2x2[6] +
 552               ((uint128_t) in1[4]) * in2x2[5] +
 553               ((uint128_t) in1[5]) * in2x2[4] +
 554               ((uint128_t) in1[6]) * in2x2[3] +
 555               ((uint128_t) in1[7]) * in2x2[2] +
 556               ((uint128_t) in1[8]) * in2x2[1];
 557
 558     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 559               ((uint128_t) in1[3]) * in2x2[7] +
 560               ((uint128_t) in1[4]) * in2x2[6] +
 561               ((uint128_t) in1[5]) * in2x2[5] +
 562               ((uint128_t) in1[6]) * in2x2[4] +
 563               ((uint128_t) in1[7]) * in2x2[3] +
 564               ((uint128_t) in1[8]) * in2x2[2];
 565
 566     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 567               ((uint128_t) in1[4]) * in2x2[7] +
 568               ((uint128_t) in1[5]) * in2x2[6] +
 569               ((uint128_t) in1[6]) * in2x2[5] +
 570               ((uint128_t) in1[7]) * in2x2[4] +
 571               ((uint128_t) in1[8]) * in2x2[3];
 572
 573     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 574               ((uint128_t) in1[5]) * in2x2[7] +
 575               ((uint128_t) in1[6]) * in2x2[6] +
 576               ((uint128_t) in1[7]) * in2x2[5] +
 577               ((uint128_t) in1[8]) * in2x2[4];
 578
 579     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 580               ((uint128_t) in1[6]) * in2x2[7] +
 581               ((uint128_t) in1[7]) * in2x2[6] +
 582               ((uint128_t) in1[8]) * in2x2[5];
 583
 584     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 585               ((uint128_t) in1[7]) * in2x2[7] +
 586               ((uint128_t) in1[8]) * in2x2[6];
 587
 588     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 589               ((uint128_t) in1[8]) * in2x2[7];
 590
 591     out[7] += ((uint128_t) in1[8]) * in2x2[8];
 592 }
 593
 594 static const limb bottom52bits = 0xfffffffffffff;
 595
 596 /*-
 597  * felem_reduce converts a largefelem to an felem.
 598  * On entry:
 599  *   in[i] < 2^128
 600  * On exit:
 601  *   out[i] < 2^59 + 2^14
 602  */
 603 static void felem_reduce(felem out, const largefelem in)
 604 {
 605     u64 overflow1, overflow2;
 606
 607     out[0] = ((limb) in[0]) & bottom58bits;
 608     out[1] = ((limb) in[1]) & bottom58bits;
 609     out[2] = ((limb) in[2]) & bottom58bits;
 610     out[3] = ((limb) in[3]) & bottom58bits;
 611     out[4] = ((limb) in[4]) & bottom58bits;
 612     out[5] = ((limb) in[5]) & bottom58bits;
 613     out[6] = ((limb) in[6]) & bottom58bits;
 614     out[7] = ((limb) in[7]) & bottom58bits;
 615     out[8] = ((limb) in[8]) & bottom58bits;
 616
 617     /* out[i] < 2^58 */
 618
 619     out[1] += ((limb) in[0]) >> 58;
 620     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 621     /*-
 622      * out[1] < 2^58 + 2^6 + 2^58
 623      *        = 2^59 + 2^6
 624      */
 625     out[2] += ((limb) (in[0] >> 64)) >> 52;
 626
 627     out[2] += ((limb) in[1]) >> 58;
 628     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 629     out[3] += ((limb) (in[1] >> 64)) >> 52;
 630
 631     out[3] += ((limb) in[2]) >> 58;
 632     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 633     out[4] += ((limb) (in[2] >> 64)) >> 52;
 634
 635     out[4] += ((limb) in[3]) >> 58;
 636     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 637     out[5] += ((limb) (in[3] >> 64)) >> 52;
 638
 639     out[5] += ((limb) in[4]) >> 58;
 640     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 641     out[6] += ((limb) (in[4] >> 64)) >> 52;
 642
 643     out[6] += ((limb) in[5]) >> 58;
 644     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 645     out[7] += ((limb) (in[5] >> 64)) >> 52;
 646
 647     out[7] += ((limb) in[6]) >> 58;
 648     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 649     out[8] += ((limb) (in[6] >> 64)) >> 52;
 650
 651     out[8] += ((limb) in[7]) >> 58;
 652     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 653     /*-
 654      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 655      *            < 2^59 + 2^13
 656      */
 657     overflow1 = ((limb) (in[7] >> 64)) >> 52;
 658
 659     overflow1 += ((limb) in[8]) >> 58;
 660     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 661     overflow2 = ((limb) (in[8] >> 64)) >> 52;
 662
 663     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
 664     overflow2 <<= 1;            /* overflow2 < 2^13 */
 665
 666     out[0] += overflow1;        /* out[0] < 2^60 */
 667     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
 668
 669     out[1] += out[0] >> 58;
 670     out[0] &= bottom58bits;
 671     /*-
 672      * out[0] < 2^58
 673      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 674      *        < 2^59 + 2^14
 675      */
 676 }
 677
 678 #if defined(ECP_NISTP521_ASM)
 679 void felem_square_wrapper(largefelem out, const felem in);
 680 void felem_mul_wrapper(largefelem out, const felem in1, const felem in2);
 681
 682 static void (*felem_square_p)(largefelem out, const felem in) =
 683     felem_square_wrapper;
 684 static void (*felem_mul_p)(largefelem out, const felem in1, const felem in2) =
 685     felem_mul_wrapper;
 686
 687 void p521_felem_square(largefelem out, const felem in);
 688 void p521_felem_mul(largefelem out, const felem in1, const felem in2);
 689
 690 # if defined(_ARCH_PPC64)
 691 #  include "crypto/ppc_arch.h"
 692 # endif
 693
 694 void felem_select(void)
 695 {
 696 # if defined(_ARCH_PPC64)
 697     if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
 698         felem_square_p = p521_felem_square;
 699         felem_mul_p = p521_felem_mul;
 700
 701         return;
 702     }
 703 # endif
 704
 705     /* Default */
 706     felem_square_p = felem_square_ref;
 707     felem_mul_p = felem_mul_ref;
 708 }
 709
 710 void felem_square_wrapper(largefelem out, const felem in)
 711 {
 712     felem_select();
 713     felem_square_p(out, in);
 714 }
 715
 716 void felem_mul_wrapper(largefelem out, const felem in1, const felem in2)
 717 {
 718     felem_select();
 719     felem_mul_p(out, in1, in2);
 720 }
 721
 722 # define felem_square felem_square_p
 723 # define felem_mul felem_mul_p
 724 #else
 725 # define felem_square felem_square_ref
 726 # define felem_mul felem_mul_ref
 727 #endif
 728
 729 static void felem_square_reduce(felem out, const felem in)
 730 {
 731     largefelem tmp;
 732     felem_square(tmp, in);
 733     felem_reduce(out, tmp);
 734 }
 735
 736 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 737 {
 738     largefelem tmp;
 739     felem_mul(tmp, in1, in2);
 740     felem_reduce(out, tmp);
 741 }
 742
 743 /*-
 744  * felem_inv calculates |out| = |in|^{-1}
 745  *
 746  * Based on Fermat's Little Theorem:
 747  *   a^p = a (mod p)
 748  *   a^{p-1} = 1 (mod p)
 749  *   a^{p-2} = a^{-1} (mod p)
 750  */
 751 static void felem_inv(felem out, const felem in)
 752 {
 753     felem ftmp, ftmp2, ftmp3, ftmp4;
 754     largefelem tmp;
 755     unsigned i;
 756
 757     felem_square(tmp, in);
 758     felem_reduce(ftmp, tmp);    /* 2^1 */
 759     felem_mul(tmp, in, ftmp);
 760     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
 761     felem_assign(ftmp2, ftmp);
 762     felem_square(tmp, ftmp);
 763     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
 764     felem_mul(tmp, in, ftmp);
 765     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
 766     felem_square(tmp, ftmp);
 767     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
 768
 769     felem_square(tmp, ftmp2);
 770     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
 771     felem_square(tmp, ftmp3);
 772     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
 773     felem_mul(tmp, ftmp3, ftmp2);
 774     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
 775
 776     felem_assign(ftmp2, ftmp3);
 777     felem_square(tmp, ftmp3);
 778     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
 779     felem_square(tmp, ftmp3);
 780     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
 781     felem_square(tmp, ftmp3);
 782     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
 783     felem_square(tmp, ftmp3);
 784     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
 785     felem_assign(ftmp4, ftmp3);
 786     felem_mul(tmp, ftmp3, ftmp);
 787     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
 788     felem_square(tmp, ftmp4);
 789     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
 790     felem_mul(tmp, ftmp3, ftmp2);
 791     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
 792     felem_assign(ftmp2, ftmp3);
 793
 794     for (i = 0; i < 8; i++) {
 795         felem_square(tmp, ftmp3);
 796         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
 797     }
 798     felem_mul(tmp, ftmp3, ftmp2);
 799     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
 800     felem_assign(ftmp2, ftmp3);
 801
 802     for (i = 0; i < 16; i++) {
 803         felem_square(tmp, ftmp3);
 804         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
 805     }
 806     felem_mul(tmp, ftmp3, ftmp2);
 807     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
 808     felem_assign(ftmp2, ftmp3);
 809
 810     for (i = 0; i < 32; i++) {
 811         felem_square(tmp, ftmp3);
 812         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
 813     }
 814     felem_mul(tmp, ftmp3, ftmp2);
 815     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
 816     felem_assign(ftmp2, ftmp3);
 817
 818     for (i = 0; i < 64; i++) {
 819         felem_square(tmp, ftmp3);
 820         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
 821     }
 822     felem_mul(tmp, ftmp3, ftmp2);
 823     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
 824     felem_assign(ftmp2, ftmp3);
 825
 826     for (i = 0; i < 128; i++) {
 827         felem_square(tmp, ftmp3);
 828         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
 829     }
 830     felem_mul(tmp, ftmp3, ftmp2);
 831     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
 832     felem_assign(ftmp2, ftmp3);
 833
 834     for (i = 0; i < 256; i++) {
 835         felem_square(tmp, ftmp3);
 836         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
 837     }
 838     felem_mul(tmp, ftmp3, ftmp2);
 839     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
 840
 841     for (i = 0; i < 9; i++) {
 842         felem_square(tmp, ftmp3);
 843         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
 844     }
 845     felem_mul(tmp, ftmp3, ftmp4);
 846     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
 847     felem_mul(tmp, ftmp3, in);
 848     felem_reduce(out, tmp);     /* 2^512 - 3 */
 849 }
 850
 851 /* This is 2^521-1, expressed as an felem */
 852 static const felem kPrime = {
 853     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 854     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 855     0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
 856 };
 857
 858 /*-
 859  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 860  * otherwise.
 861  * On entry:
 862  *   in[i] < 2^59 + 2^14
 863  */
 864 static limb felem_is_zero(const felem in)
 865 {
 866     felem ftmp;
 867     limb is_zero, is_p;
 868     felem_assign(ftmp, in);
 869
 870     ftmp[0] += ftmp[8] >> 57;
 871     ftmp[8] &= bottom57bits;
 872     /* ftmp[8] < 2^57 */
 873     ftmp[1] += ftmp[0] >> 58;
 874     ftmp[0] &= bottom58bits;
 875     ftmp[2] += ftmp[1] >> 58;
 876     ftmp[1] &= bottom58bits;
 877     ftmp[3] += ftmp[2] >> 58;
 878     ftmp[2] &= bottom58bits;
 879     ftmp[4] += ftmp[3] >> 58;
 880     ftmp[3] &= bottom58bits;
 881     ftmp[5] += ftmp[4] >> 58;
 882     ftmp[4] &= bottom58bits;
 883     ftmp[6] += ftmp[5] >> 58;
 884     ftmp[5] &= bottom58bits;
 885     ftmp[7] += ftmp[6] >> 58;
 886     ftmp[6] &= bottom58bits;
 887     ftmp[8] += ftmp[7] >> 58;
 888     ftmp[7] &= bottom58bits;
 889     /* ftmp[8] < 2^57 + 4 */
 890
 891     /*
 892      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
 893      * than our bound for ftmp[8]. Therefore we only have to check if the
 894      * zero is zero or 2^521-1.
 895      */
 896
 897     is_zero = 0;
 898     is_zero |= ftmp[0];
 899     is_zero |= ftmp[1];
 900     is_zero |= ftmp[2];
 901     is_zero |= ftmp[3];
 902     is_zero |= ftmp[4];
 903     is_zero |= ftmp[5];
 904     is_zero |= ftmp[6];
 905     is_zero |= ftmp[7];
 906     is_zero |= ftmp[8];
 907
 908     is_zero--;
 909     /*
 910      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
 911      * can be set is if is_zero was 0 before the decrement.
 912      */
 913     is_zero = 0 - (is_zero >> 63);
 914
 915     is_p = ftmp[0] ^ kPrime[0];
 916     is_p |= ftmp[1] ^ kPrime[1];
 917     is_p |= ftmp[2] ^ kPrime[2];
 918     is_p |= ftmp[3] ^ kPrime[3];
 919     is_p |= ftmp[4] ^ kPrime[4];
 920     is_p |= ftmp[5] ^ kPrime[5];
 921     is_p |= ftmp[6] ^ kPrime[6];
 922     is_p |= ftmp[7] ^ kPrime[7];
 923     is_p |= ftmp[8] ^ kPrime[8];
 924
 925     is_p--;
 926     is_p = 0 - (is_p >> 63);
 927
 928     is_zero |= is_p;
 929     return is_zero;
 930 }
 931
 932 static int felem_is_zero_int(const void *in)
 933 {
 934     return (int)(felem_is_zero(in) & ((limb) 1));
 935 }
 936
 937 /*-
 938  * felem_contract converts |in| to its unique, minimal representation.
 939  * On entry:
 940  *   in[i] < 2^59 + 2^14
 941  */
 942 static void felem_contract(felem out, const felem in)
 943 {
 944     limb is_p, is_greater, sign;
 945     static const limb two58 = ((limb) 1) << 58;
 946
 947     felem_assign(out, in);
 948
 949     out[0] += out[8] >> 57;
 950     out[8] &= bottom57bits;
 951     /* out[8] < 2^57 */
 952     out[1] += out[0] >> 58;
 953     out[0] &= bottom58bits;
 954     out[2] += out[1] >> 58;
 955     out[1] &= bottom58bits;
 956     out[3] += out[2] >> 58;
 957     out[2] &= bottom58bits;
 958     out[4] += out[3] >> 58;
 959     out[3] &= bottom58bits;
 960     out[5] += out[4] >> 58;
 961     out[4] &= bottom58bits;
 962     out[6] += out[5] >> 58;
 963     out[5] &= bottom58bits;
 964     out[7] += out[6] >> 58;
 965     out[6] &= bottom58bits;
 966     out[8] += out[7] >> 58;
 967     out[7] &= bottom58bits;
 968     /* out[8] < 2^57 + 4 */
 969
 970     /*
 971      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
 972      * out. See the comments in felem_is_zero regarding why we don't test for
 973      * other multiples of the prime.
 974      */
 975
 976     /*
 977      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
 978      */
 979
 980     is_p = out[0] ^ kPrime[0];
 981     is_p |= out[1] ^ kPrime[1];
 982     is_p |= out[2] ^ kPrime[2];
 983     is_p |= out[3] ^ kPrime[3];
 984     is_p |= out[4] ^ kPrime[4];
 985     is_p |= out[5] ^ kPrime[5];
 986     is_p |= out[6] ^ kPrime[6];
 987     is_p |= out[7] ^ kPrime[7];
 988     is_p |= out[8] ^ kPrime[8];
 989
 990     is_p--;
 991     is_p &= is_p << 32;
 992     is_p &= is_p << 16;
 993     is_p &= is_p << 8;
 994     is_p &= is_p << 4;
 995     is_p &= is_p << 2;
 996     is_p &= is_p << 1;
 997     is_p = 0 - (is_p >> 63);
 998     is_p = ~is_p;
 999
1000     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
1001
1002     out[0] &= is_p;
1003     out[1] &= is_p;
1004     out[2] &= is_p;
1005     out[3] &= is_p;
1006     out[4] &= is_p;
1007     out[5] &= is_p;
1008     out[6] &= is_p;
1009     out[7] &= is_p;
1010     out[8] &= is_p;
1011
1012     /*
1013      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
1014      * 57 is greater than zero as (2^521-1) + x >= 2^522
1015      */
1016     is_greater = out[8] >> 57;
1017     is_greater |= is_greater << 32;
1018     is_greater |= is_greater << 16;
1019     is_greater |= is_greater << 8;
1020     is_greater |= is_greater << 4;
1021     is_greater |= is_greater << 2;
1022     is_greater |= is_greater << 1;
1023     is_greater = 0 - (is_greater >> 63);
1024
1025     out[0] -= kPrime[0] & is_greater;
1026     out[1] -= kPrime[1] & is_greater;
1027     out[2] -= kPrime[2] & is_greater;
1028     out[3] -= kPrime[3] & is_greater;
1029     out[4] -= kPrime[4] & is_greater;
1030     out[5] -= kPrime[5] & is_greater;
1031     out[6] -= kPrime[6] & is_greater;
1032     out[7] -= kPrime[7] & is_greater;
1033     out[8] -= kPrime[8] & is_greater;
1034
1035     /* Eliminate negative coefficients */
1036     sign = -(out[0] >> 63);
1037     out[0] += (two58 & sign);
1038     out[1] -= (1 & sign);
1039     sign = -(out[1] >> 63);
1040     out[1] += (two58 & sign);
1041     out[2] -= (1 & sign);
1042     sign = -(out[2] >> 63);
1043     out[2] += (two58 & sign);
1044     out[3] -= (1 & sign);
1045     sign = -(out[3] >> 63);
1046     out[3] += (two58 & sign);
1047     out[4] -= (1 & sign);
1048     sign = -(out[4] >> 63);
1049     out[4] += (two58 & sign);
1050     out[5] -= (1 & sign);
1051     sign = -(out[0] >> 63);
1052     out[5] += (two58 & sign);
1053     out[6] -= (1 & sign);
1054     sign = -(out[6] >> 63);
1055     out[6] += (two58 & sign);
1056     out[7] -= (1 & sign);
1057     sign = -(out[7] >> 63);
1058     out[7] += (two58 & sign);
1059     out[8] -= (1 & sign);
1060     sign = -(out[5] >> 63);
1061     out[5] += (two58 & sign);
1062     out[6] -= (1 & sign);
1063     sign = -(out[6] >> 63);
1064     out[6] += (two58 & sign);
1065     out[7] -= (1 & sign);
1066     sign = -(out[7] >> 63);
1067     out[7] += (two58 & sign);
1068     out[8] -= (1 & sign);
1069 }
1070
1071 /*-
1072  * Group operations
1073  * ----------------
1074  *
1075  * Building on top of the field operations we have the operations on the
1076  * elliptic curve group itself. Points on the curve are represented in Jacobian
1077  * coordinates */
1078
1079 /*-
1080  * point_double calculates 2*(x_in, y_in, z_in)
1081  *
1082  * The method is taken from:
1083  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1084  *
1085  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1086  * while x_out == y_in is not (maybe this works, but it's not tested). */
1087 static void
1088 point_double(felem x_out, felem y_out, felem z_out,
1089              const felem x_in, const felem y_in, const felem z_in)
1090 {
1091     largefelem tmp, tmp2;
1092     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1093
1094     felem_assign(ftmp, x_in);
1095     felem_assign(ftmp2, x_in);
1096
1097     /* delta = z^2 */
1098     felem_square(tmp, z_in);
1099     felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1100
1101     /* gamma = y^2 */
1102     felem_square(tmp, y_in);
1103     felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1104
1105     /* beta = x*gamma */
1106     felem_mul(tmp, x_in, gamma);
1107     felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1108
1109     /* alpha = 3*(x-delta)*(x+delta) */
1110     felem_diff64(ftmp, delta);
1111     /* ftmp[i] < 2^61 */
1112     felem_sum64(ftmp2, delta);
1113     /* ftmp2[i] < 2^60 + 2^15 */
1114     felem_scalar64(ftmp2, 3);
1115     /* ftmp2[i] < 3*2^60 + 3*2^15 */
1116     felem_mul(tmp, ftmp, ftmp2);
1117     /*-
1118      * tmp[i] < 17(3*2^121 + 3*2^76)
1119      *        = 61*2^121 + 61*2^76
1120      *        < 64*2^121 + 64*2^76
1121      *        = 2^127 + 2^82
1122      *        < 2^128
1123      */
1124     felem_reduce(alpha, tmp);
1125
1126     /* x' = alpha^2 - 8*beta */
1127     felem_square(tmp, alpha);
1128     /*
1129      * tmp[i] < 17*2^120 < 2^125
1130      */
1131     felem_assign(ftmp, beta);
1132     felem_scalar64(ftmp, 8);
1133     /* ftmp[i] < 2^62 + 2^17 */
1134     felem_diff_128_64(tmp, ftmp);
1135     /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1136     felem_reduce(x_out, tmp);
1137
1138     /* z' = (y + z)^2 - gamma - delta */
1139     felem_sum64(delta, gamma);
1140     /* delta[i] < 2^60 + 2^15 */
1141     felem_assign(ftmp, y_in);
1142     felem_sum64(ftmp, z_in);
1143     /* ftmp[i] < 2^60 + 2^15 */
1144     felem_square(tmp, ftmp);
1145     /*
1146      * tmp[i] < 17(2^122) < 2^127
1147      */
1148     felem_diff_128_64(tmp, delta);
1149     /* tmp[i] < 2^127 + 2^63 */
1150     felem_reduce(z_out, tmp);
1151
1152     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1153     felem_scalar64(beta, 4);
1154     /* beta[i] < 2^61 + 2^16 */
1155     felem_diff64(beta, x_out);
1156     /* beta[i] < 2^61 + 2^60 + 2^16 */
1157     felem_mul(tmp, alpha, beta);
1158     /*-
1159      * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1160      *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1161      *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1162      *        < 2^128
1163      */
1164     felem_square(tmp2, gamma);
1165     /*-
1166      * tmp2[i] < 17*(2^59 + 2^14)^2
1167      *         = 17*(2^118 + 2^74 + 2^28)
1168      */
1169     felem_scalar128(tmp2, 8);
1170     /*-
1171      * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1172      *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1173      *         < 2^126
1174      */
1175     felem_diff128(tmp, tmp2);
1176     /*-
1177      * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1178      *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1179      *          2^74 + 2^69 + 2^34 + 2^30
1180      *        < 2^128
1181      */
1182     felem_reduce(y_out, tmp);
1183 }
1184
1185 /* copy_conditional copies in to out iff mask is all ones. */
1186 static void copy_conditional(felem out, const felem in, limb mask)
1187 {
1188     unsigned i;
1189     for (i = 0; i < NLIMBS; ++i) {
1190         const limb tmp = mask & (in[i] ^ out[i]);
1191         out[i] ^= tmp;
1192     }
1193 }
1194
1195 /*-
1196  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1197  *
1198  * The method is taken from
1199  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1200  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1201  *
1202  * This function includes a branch for checking whether the two input points
1203  * are equal (while not equal to the point at infinity). See comment below
1204  * on constant-time.
1205  */
1206 static void point_add(felem x3, felem y3, felem z3,
1207                       const felem x1, const felem y1, const felem z1,
1208                       const int mixed, const felem x2, const felem y2,
1209                       const felem z2)
1210 {
1211     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1212     largefelem tmp, tmp2;
1213     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1214     limb points_equal;
1215
1216     z1_is_zero = felem_is_zero(z1);
1217     z2_is_zero = felem_is_zero(z2);
1218
1219     /* ftmp = z1z1 = z1**2 */
1220     felem_square(tmp, z1);
1221     felem_reduce(ftmp, tmp);
1222
1223     if (!mixed) {
1224         /* ftmp2 = z2z2 = z2**2 */
1225         felem_square(tmp, z2);
1226         felem_reduce(ftmp2, tmp);
1227
1228         /* u1 = ftmp3 = x1*z2z2 */
1229         felem_mul(tmp, x1, ftmp2);
1230         felem_reduce(ftmp3, tmp);
1231
1232         /* ftmp5 = z1 + z2 */
1233         felem_assign(ftmp5, z1);
1234         felem_sum64(ftmp5, z2);
1235         /* ftmp5[i] < 2^61 */
1236
1237         /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1238         felem_square(tmp, ftmp5);
1239         /* tmp[i] < 17*2^122 */
1240         felem_diff_128_64(tmp, ftmp);
1241         /* tmp[i] < 17*2^122 + 2^63 */
1242         felem_diff_128_64(tmp, ftmp2);
1243         /* tmp[i] < 17*2^122 + 2^64 */
1244         felem_reduce(ftmp5, tmp);
1245
1246         /* ftmp2 = z2 * z2z2 */
1247         felem_mul(tmp, ftmp2, z2);
1248         felem_reduce(ftmp2, tmp);
1249
1250         /* s1 = ftmp6 = y1 * z2**3 */
1251         felem_mul(tmp, y1, ftmp2);
1252         felem_reduce(ftmp6, tmp);
1253     } else {
1254         /*
1255          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1256          */
1257
1258         /* u1 = ftmp3 = x1*z2z2 */
1259         felem_assign(ftmp3, x1);
1260
1261         /* ftmp5 = 2*z1z2 */
1262         felem_scalar(ftmp5, z1, 2);
1263
1264         /* s1 = ftmp6 = y1 * z2**3 */
1265         felem_assign(ftmp6, y1);
1266     }
1267
1268     /* u2 = x2*z1z1 */
1269     felem_mul(tmp, x2, ftmp);
1270     /* tmp[i] < 17*2^120 */
1271
1272     /* h = ftmp4 = u2 - u1 */
1273     felem_diff_128_64(tmp, ftmp3);
1274     /* tmp[i] < 17*2^120 + 2^63 */
1275     felem_reduce(ftmp4, tmp);
1276
1277     x_equal = felem_is_zero(ftmp4);
1278
1279     /* z_out = ftmp5 * h */
1280     felem_mul(tmp, ftmp5, ftmp4);
1281     felem_reduce(z_out, tmp);
1282
1283     /* ftmp = z1 * z1z1 */
1284     felem_mul(tmp, ftmp, z1);
1285     felem_reduce(ftmp, tmp);
1286
1287     /* s2 = tmp = y2 * z1**3 */
1288     felem_mul(tmp, y2, ftmp);
1289     /* tmp[i] < 17*2^120 */
1290
1291     /* r = ftmp5 = (s2 - s1)*2 */
1292     felem_diff_128_64(tmp, ftmp6);
1293     /* tmp[i] < 17*2^120 + 2^63 */
1294     felem_reduce(ftmp5, tmp);
1295     y_equal = felem_is_zero(ftmp5);
1296     felem_scalar64(ftmp5, 2);
1297     /* ftmp5[i] < 2^61 */
1298
1299     /*
1300      * The formulae are incorrect if the points are equal, in affine coordinates
1301      * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1302      * happens.
1303      *
1304      * We use bitwise operations to avoid potential side-channels introduced by
1305      * the short-circuiting behaviour of boolean operators.
1306      *
1307      * The special case of either point being the point at infinity (z1 and/or
1308      * z2 are zero), is handled separately later on in this function, so we
1309      * avoid jumping to point_double here in those special cases.
1310      *
1311      * Notice the comment below on the implications of this branching for timing
1312      * leaks and why it is considered practically irrelevant.
1313      */
1314     points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1315
1316     if (points_equal) {
1317         /*
1318          * This is obviously not constant-time but it will almost-never happen
1319          * for ECDH / ECDSA. The case where it can happen is during scalar-mult
1320          * where the intermediate value gets very close to the group order.
1321          * Since |ossl_ec_GFp_nistp_recode_scalar_bits| produces signed digits
1322          * for the scalar, it's possible for the intermediate value to be a small
1323          * negative multiple of the base point, and for the final signed digit
1324          * to be the same value. We believe that this only occurs for the scalar
1325          * 1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
1326          * ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
1327          * 71e913863f7, in that case the penultimate intermediate is -9G and
1328          * the final digit is also -9G. Since this only happens for a single
1329          * scalar, the timing leak is irrelevant. (Any attacker who wanted to
1330          * check whether a secret scalar was that exact value, can already do
1331          * so.)
1332          */
1333         point_double(x3, y3, z3, x1, y1, z1);
1334         return;
1335     }
1336
1337     /* I = ftmp = (2h)**2 */
1338     felem_assign(ftmp, ftmp4);
1339     felem_scalar64(ftmp, 2);
1340     /* ftmp[i] < 2^61 */
1341     felem_square(tmp, ftmp);
1342     /* tmp[i] < 17*2^122 */
1343     felem_reduce(ftmp, tmp);
1344
1345     /* J = ftmp2 = h * I */
1346     felem_mul(tmp, ftmp4, ftmp);
1347     felem_reduce(ftmp2, tmp);
1348
1349     /* V = ftmp4 = U1 * I */
1350     felem_mul(tmp, ftmp3, ftmp);
1351     felem_reduce(ftmp4, tmp);
1352
1353     /* x_out = r**2 - J - 2V */
1354     felem_square(tmp, ftmp5);
1355     /* tmp[i] < 17*2^122 */
1356     felem_diff_128_64(tmp, ftmp2);
1357     /* tmp[i] < 17*2^122 + 2^63 */
1358     felem_assign(ftmp3, ftmp4);
1359     felem_scalar64(ftmp4, 2);
1360     /* ftmp4[i] < 2^61 */
1361     felem_diff_128_64(tmp, ftmp4);
1362     /* tmp[i] < 17*2^122 + 2^64 */
1363     felem_reduce(x_out, tmp);
1364
1365     /* y_out = r(V-x_out) - 2 * s1 * J */
1366     felem_diff64(ftmp3, x_out);
1367     /*
1368      * ftmp3[i] < 2^60 + 2^60 = 2^61
1369      */
1370     felem_mul(tmp, ftmp5, ftmp3);
1371     /* tmp[i] < 17*2^122 */
1372     felem_mul(tmp2, ftmp6, ftmp2);
1373     /* tmp2[i] < 17*2^120 */
1374     felem_scalar128(tmp2, 2);
1375     /* tmp2[i] < 17*2^121 */
1376     felem_diff128(tmp, tmp2);
1377         /*-
1378          * tmp[i] < 2^127 - 2^69 + 17*2^122
1379          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1380          *        < 2^127
1381          */
1382     felem_reduce(y_out, tmp);
1383
1384     copy_conditional(x_out, x2, z1_is_zero);
1385     copy_conditional(x_out, x1, z2_is_zero);
1386     copy_conditional(y_out, y2, z1_is_zero);
1387     copy_conditional(y_out, y1, z2_is_zero);
1388     copy_conditional(z_out, z2, z1_is_zero);
1389     copy_conditional(z_out, z1, z2_is_zero);
1390     felem_assign(x3, x_out);
1391     felem_assign(y3, y_out);
1392     felem_assign(z3, z_out);
1393 }
1394
1395 /*-
1396  * Base point pre computation
1397  * --------------------------
1398  *
1399  * Two different sorts of precomputed tables are used in the following code.
1400  * Each contain various points on the curve, where each point is three field
1401  * elements (x, y, z).
1402  *
1403  * For the base point table, z is usually 1 (0 for the point at infinity).
1404  * This table has 16 elements:
1405  * index | bits    | point
1406  * ------+---------+------------------------------
1407  *     0 | 0 0 0 0 | 0G
1408  *     1 | 0 0 0 1 | 1G
1409  *     2 | 0 0 1 0 | 2^130G
1410  *     3 | 0 0 1 1 | (2^130 + 1)G
1411  *     4 | 0 1 0 0 | 2^260G
1412  *     5 | 0 1 0 1 | (2^260 + 1)G
1413  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1414  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1415  *     8 | 1 0 0 0 | 2^390G
1416  *     9 | 1 0 0 1 | (2^390 + 1)G
1417  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1418  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1419  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1420  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1421  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1422  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1423  *
1424  * The reason for this is so that we can clock bits into four different
1425  * locations when doing simple scalar multiplies against the base point.
1426  *
1427  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1428
1429 /* gmul is the table of precomputed base points */
1430 static const felem gmul[16][3] = {
1431 {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1432  {0, 0, 0, 0, 0, 0, 0, 0, 0},
1433  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1434 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1435   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1436   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1437  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1438   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1439   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1440  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1441 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1442   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1443   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1444  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1445   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1446   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1447  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1448 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1449   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1450   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1451  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1452   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1453   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1454  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1455 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1456   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1457   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1458  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1459   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1460   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1461  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1462 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1463   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1464   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1465  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1466   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1467   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1468  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1469 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1470   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1471   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1472  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1473   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1474   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1475  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1476 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1477   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1478   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1479  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1480   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1481   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1482  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1483 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1484   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1485   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1486  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1487   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1488   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1489  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1490 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1491   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1492   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1493  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1494   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1495   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1496  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1497 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1498   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1499   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1500  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1501   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1502   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1503  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1504 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1505   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1506   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1507  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1508   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1509   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1510  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1511 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1512   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1513   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1514  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1515   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1516   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1517  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1518 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1519   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1520   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1521  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1522   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1523   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1524  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1525 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1526   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1527   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1528  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1529   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1530   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1531  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1532 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1533   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1534   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1535  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1536   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1537   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1538  {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1539 };
1540
1541 /*
1542  * select_point selects the |idx|th point from a precomputation table and
1543  * copies it to out.
1544  */
1545  /* pre_comp below is of the size provided in |size| */
1546 static void select_point(const limb idx, unsigned int size,
1547                          const felem pre_comp[][3], felem out[3])
1548 {
1549     unsigned i, j;
1550     limb *outlimbs = &out[0][0];
1551
1552     memset(out, 0, sizeof(*out) * 3);
1553
1554     for (i = 0; i < size; i++) {
1555         const limb *inlimbs = &pre_comp[i][0][0];
1556         limb mask = i ^ idx;
1557         mask |= mask >> 4;
1558         mask |= mask >> 2;
1559         mask |= mask >> 1;
1560         mask &= 1;
1561         mask--;
1562         for (j = 0; j < NLIMBS * 3; j++)
1563             outlimbs[j] |= inlimbs[j] & mask;
1564     }
1565 }
1566
1567 /* get_bit returns the |i|th bit in |in| */
1568 static char get_bit(const felem_bytearray in, int i)
1569 {
1570     if (i < 0)
1571         return 0;
1572     return (in[i >> 3] >> (i & 7)) & 1;
1573 }
1574
1575 /*
1576  * Interleaved point multiplication using precomputed point multiples: The
1577  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1578  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1579  * generator, using certain (large) precomputed multiples in g_pre_comp.
1580  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1581  */
1582 static void batch_mul(felem x_out, felem y_out, felem z_out,
1583                       const felem_bytearray scalars[],
1584                       const unsigned num_points, const u8 *g_scalar,
1585                       const int mixed, const felem pre_comp[][17][3],
1586                       const felem g_pre_comp[16][3])
1587 {
1588     int i, skip;
1589     unsigned num, gen_mul = (g_scalar != NULL);
1590     felem nq[3], tmp[4];
1591     limb bits;
1592     u8 sign, digit;
1593
1594     /* set nq to the point at infinity */
1595     memset(nq, 0, sizeof(nq));
1596
1597     /*
1598      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1599      * of the generator (last quarter of rounds) and additions of other
1600      * points multiples (every 5th round).
1601      */
1602     skip = 1;                   /* save two point operations in the first
1603                                  * round */
1604     for (i = (num_points ? 520 : 130); i >= 0; --i) {
1605         /* double */
1606         if (!skip)
1607             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1608
1609         /* add multiples of the generator */
1610         if (gen_mul && (i <= 130)) {
1611             bits = get_bit(g_scalar, i + 390) << 3;
1612             if (i < 130) {
1613                 bits |= get_bit(g_scalar, i + 260) << 2;
1614                 bits |= get_bit(g_scalar, i + 130) << 1;
1615                 bits |= get_bit(g_scalar, i);
1616             }
1617             /* select the point to add, in constant time */
1618             select_point(bits, 16, g_pre_comp, tmp);
1619             if (!skip) {
1620                 /* The 1 argument below is for "mixed" */
1621                 point_add(nq[0], nq[1], nq[2],
1622                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1623             } else {
1624                 memcpy(nq, tmp, 3 * sizeof(felem));
1625                 skip = 0;
1626             }
1627         }
1628
1629         /* do other additions every 5 doublings */
1630         if (num_points && (i % 5 == 0)) {
1631             /* loop over all scalars */
1632             for (num = 0; num < num_points; ++num) {
1633                 bits = get_bit(scalars[num], i + 4) << 5;
1634                 bits |= get_bit(scalars[num], i + 3) << 4;
1635                 bits |= get_bit(scalars[num], i + 2) << 3;
1636                 bits |= get_bit(scalars[num], i + 1) << 2;
1637                 bits |= get_bit(scalars[num], i) << 1;
1638                 bits |= get_bit(scalars[num], i - 1);
1639                 ossl_ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1640
1641                 /*
1642                  * select the point to add or subtract, in constant time
1643                  */
1644                 select_point(digit, 17, pre_comp[num], tmp);
1645                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1646                                             * point */
1647                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1648
1649                 if (!skip) {
1650                     point_add(nq[0], nq[1], nq[2],
1651                               nq[0], nq[1], nq[2],
1652                               mixed, tmp[0], tmp[1], tmp[2]);
1653                 } else {
1654                     memcpy(nq, tmp, 3 * sizeof(felem));
1655                     skip = 0;
1656                 }
1657             }
1658         }
1659     }
1660     felem_assign(x_out, nq[0]);
1661     felem_assign(y_out, nq[1]);
1662     felem_assign(z_out, nq[2]);
1663 }
1664
1665 /* Precomputation for the group generator. */
1666 struct nistp521_pre_comp_st {
1667     felem g_pre_comp[16][3];
1668     CRYPTO_REF_COUNT references;
1669     CRYPTO_RWLOCK *lock;
1670 };
1671
1672 const EC_METHOD *EC_GFp_nistp521_method(void)
1673 {
1674     static const EC_METHOD ret = {
1675         EC_FLAGS_DEFAULT_OCT,
1676         NID_X9_62_prime_field,
1677         ossl_ec_GFp_nistp521_group_init,
1678         ossl_ec_GFp_simple_group_finish,
1679         ossl_ec_GFp_simple_group_clear_finish,
1680         ossl_ec_GFp_nist_group_copy,
1681         ossl_ec_GFp_nistp521_group_set_curve,
1682         ossl_ec_GFp_simple_group_get_curve,
1683         ossl_ec_GFp_simple_group_get_degree,
1684         ossl_ec_group_simple_order_bits,
1685         ossl_ec_GFp_simple_group_check_discriminant,
1686         ossl_ec_GFp_simple_point_init,
1687         ossl_ec_GFp_simple_point_finish,
1688         ossl_ec_GFp_simple_point_clear_finish,
1689         ossl_ec_GFp_simple_point_copy,
1690         ossl_ec_GFp_simple_point_set_to_infinity,
1691         ossl_ec_GFp_simple_point_set_affine_coordinates,
1692         ossl_ec_GFp_nistp521_point_get_affine_coordinates,
1693         0 /* point_set_compressed_coordinates */ ,
1694         0 /* point2oct */ ,
1695         0 /* oct2point */ ,
1696         ossl_ec_GFp_simple_add,
1697         ossl_ec_GFp_simple_dbl,
1698         ossl_ec_GFp_simple_invert,
1699         ossl_ec_GFp_simple_is_at_infinity,
1700         ossl_ec_GFp_simple_is_on_curve,
1701         ossl_ec_GFp_simple_cmp,
1702         ossl_ec_GFp_simple_make_affine,
1703         ossl_ec_GFp_simple_points_make_affine,
1704         ossl_ec_GFp_nistp521_points_mul,
1705         ossl_ec_GFp_nistp521_precompute_mult,
1706         ossl_ec_GFp_nistp521_have_precompute_mult,
1707         ossl_ec_GFp_nist_field_mul,
1708         ossl_ec_GFp_nist_field_sqr,
1709         0 /* field_div */ ,
1710         ossl_ec_GFp_simple_field_inv,
1711         0 /* field_encode */ ,
1712         0 /* field_decode */ ,
1713         0,                      /* field_set_to_one */
1714         ossl_ec_key_simple_priv2oct,
1715         ossl_ec_key_simple_oct2priv,
1716         0, /* set private */
1717         ossl_ec_key_simple_generate_key,
1718         ossl_ec_key_simple_check_key,
1719         ossl_ec_key_simple_generate_public_key,
1720         0, /* keycopy */
1721         0, /* keyfinish */
1722         ossl_ecdh_simple_compute_key,
1723         ossl_ecdsa_simple_sign_setup,
1724         ossl_ecdsa_simple_sign_sig,
1725         ossl_ecdsa_simple_verify_sig,
1726         0, /* field_inverse_mod_ord */
1727         0, /* blind_coordinates */
1728         0, /* ladder_pre */
1729         0, /* ladder_step */
1730         0  /* ladder_post */
1731     };
1732
1733     return &ret;
1734 }
1735
1736 /******************************************************************************/
1737 /*
1738  * FUNCTIONS TO MANAGE PRECOMPUTATION
1739  */
1740
1741 static NISTP521_PRE_COMP *nistp521_pre_comp_new(void)
1742 {
1743     NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1744
1745     if (ret == NULL) {
1746         ERR_raise(ERR_LIB_EC, ERR_R_MALLOC_FAILURE);
1747         return ret;
1748     }
1749
1750     ret->references = 1;
1751
1752     ret->lock = CRYPTO_THREAD_lock_new();
1753     if (ret->lock == NULL) {
1754         ERR_raise(ERR_LIB_EC, ERR_R_MALLOC_FAILURE);
1755         OPENSSL_free(ret);
1756         return NULL;
1757     }
1758     return ret;
1759 }
1760
1761 NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1762 {
1763     int i;
1764     if (p != NULL)
1765         CRYPTO_UP_REF(&p->references, &i, p->lock);
1766     return p;
1767 }
1768
1769 void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1770 {
1771     int i;
1772
1773     if (p == NULL)
1774         return;
1775
1776     CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1777     REF_PRINT_COUNT("EC_nistp521", p);
1778     if (i > 0)
1779         return;
1780     REF_ASSERT_ISNT(i < 0);
1781
1782     CRYPTO_THREAD_lock_free(p->lock);
1783     OPENSSL_free(p);
1784 }
1785
1786 /******************************************************************************/
1787 /*
1788  * OPENSSL EC_METHOD FUNCTIONS
1789  */
1790
1791 int ossl_ec_GFp_nistp521_group_init(EC_GROUP *group)
1792 {
1793     int ret;
1794     ret = ossl_ec_GFp_simple_group_init(group);
1795     group->a_is_minus3 = 1;
1796     return ret;
1797 }
1798
1799 int ossl_ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1800                                          const BIGNUM *a, const BIGNUM *b,
1801                                          BN_CTX *ctx)
1802 {
1803     int ret = 0;
1804     BIGNUM *curve_p, *curve_a, *curve_b;
1805 #ifndef FIPS_MODULE
1806     BN_CTX *new_ctx = NULL;
1807
1808     if (ctx == NULL)
1809         ctx = new_ctx = BN_CTX_new();
1810 #endif
1811     if (ctx == NULL)
1812         return 0;
1813
1814     BN_CTX_start(ctx);
1815     curve_p = BN_CTX_get(ctx);
1816     curve_a = BN_CTX_get(ctx);
1817     curve_b = BN_CTX_get(ctx);
1818     if (curve_b == NULL)
1819         goto err;
1820     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1821     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1822     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1823     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1824         ERR_raise(ERR_LIB_EC, EC_R_WRONG_CURVE_PARAMETERS);
1825         goto err;
1826     }
1827     group->field_mod_func = BN_nist_mod_521;
1828     ret = ossl_ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1829  err:
1830     BN_CTX_end(ctx);
1831 #ifndef FIPS_MODULE
1832     BN_CTX_free(new_ctx);
1833 #endif
1834     return ret;
1835 }
1836
1837 /*
1838  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1839  * (X/Z^2, Y/Z^3)
1840  */
1841 int ossl_ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1842                                                       const EC_POINT *point,
1843                                                       BIGNUM *x, BIGNUM *y,
1844                                                       BN_CTX *ctx)
1845 {
1846     felem z1, z2, x_in, y_in, x_out, y_out;
1847     largefelem tmp;
1848
1849     if (EC_POINT_is_at_infinity(group, point)) {
1850         ERR_raise(ERR_LIB_EC, EC_R_POINT_AT_INFINITY);
1851         return 0;
1852     }
1853     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1854         (!BN_to_felem(z1, point->Z)))
1855         return 0;
1856     felem_inv(z2, z1);
1857     felem_square(tmp, z2);
1858     felem_reduce(z1, tmp);
1859     felem_mul(tmp, x_in, z1);
1860     felem_reduce(x_in, tmp);
1861     felem_contract(x_out, x_in);
1862     if (x != NULL) {
1863         if (!felem_to_BN(x, x_out)) {
1864             ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1865             return 0;
1866         }
1867     }
1868     felem_mul(tmp, z1, z2);
1869     felem_reduce(z1, tmp);
1870     felem_mul(tmp, y_in, z1);
1871     felem_reduce(y_in, tmp);
1872     felem_contract(y_out, y_in);
1873     if (y != NULL) {
1874         if (!felem_to_BN(y, y_out)) {
1875             ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1876             return 0;
1877         }
1878     }
1879     return 1;
1880 }
1881
1882 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1883 static void make_points_affine(size_t num, felem points[][3],
1884                                felem tmp_felems[])
1885 {
1886     /*
1887      * Runs in constant time, unless an input is the point at infinity (which
1888      * normally shouldn't happen).
1889      */
1890     ossl_ec_GFp_nistp_points_make_affine_internal(num,
1891                                                   points,
1892                                                   sizeof(felem),
1893                                                   tmp_felems,
1894                                                   (void (*)(void *))felem_one,
1895                                                   felem_is_zero_int,
1896                                                   (void (*)(void *, const void *))
1897                                                   felem_assign,
1898                                                   (void (*)(void *, const void *))
1899                                                   felem_square_reduce, (void (*)
1900                                                                         (void *,
1901                                                                          const void
1902                                                                          *,
1903                                                                          const void
1904                                                                          *))
1905                                                   felem_mul_reduce,
1906                                                   (void (*)(void *, const void *))
1907                                                   felem_inv,
1908                                                   (void (*)(void *, const void *))
1909                                                   felem_contract);
1910 }
1911
1912 /*
1913  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1914  * values Result is stored in r (r can equal one of the inputs).
1915  */
1916 int ossl_ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1917                                     const BIGNUM *scalar, size_t num,
1918                                     const EC_POINT *points[],
1919                                     const BIGNUM *scalars[], BN_CTX *ctx)
1920 {
1921     int ret = 0;
1922     int j;
1923     int mixed = 0;
1924     BIGNUM *x, *y, *z, *tmp_scalar;
1925     felem_bytearray g_secret;
1926     felem_bytearray *secrets = NULL;
1927     felem (*pre_comp)[17][3] = NULL;
1928     felem *tmp_felems = NULL;
1929     unsigned i;
1930     int num_bytes;
1931     int have_pre_comp = 0;
1932     size_t num_points = num;
1933     felem x_in, y_in, z_in, x_out, y_out, z_out;
1934     NISTP521_PRE_COMP *pre = NULL;
1935     felem(*g_pre_comp)[3] = NULL;
1936     EC_POINT *generator = NULL;
1937     const EC_POINT *p = NULL;
1938     const BIGNUM *p_scalar = NULL;
1939
1940     BN_CTX_start(ctx);
1941     x = BN_CTX_get(ctx);
1942     y = BN_CTX_get(ctx);
1943     z = BN_CTX_get(ctx);
1944     tmp_scalar = BN_CTX_get(ctx);
1945     if (tmp_scalar == NULL)
1946         goto err;
1947
1948     if (scalar != NULL) {
1949         pre = group->pre_comp.nistp521;
1950         if (pre)
1951             /* we have precomputation, try to use it */
1952             g_pre_comp = &pre->g_pre_comp[0];
1953         else
1954             /* try to use the standard precomputation */
1955             g_pre_comp = (felem(*)[3]) gmul;
1956         generator = EC_POINT_new(group);
1957         if (generator == NULL)
1958             goto err;
1959         /* get the generator from precomputation */
1960         if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1961             !felem_to_BN(y, g_pre_comp[1][1]) ||
1962             !felem_to_BN(z, g_pre_comp[1][2])) {
1963             ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1964             goto err;
1965         }
1966         if (!ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group,
1967                                                                 generator,
1968                                                                 x, y, z, ctx))
1969             goto err;
1970         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1971             /* precomputation matches generator */
1972             have_pre_comp = 1;
1973         else
1974             /*
1975              * we don't have valid precomputation: treat the generator as a
1976              * random point
1977              */
1978             num_points++;
1979     }
1980
1981     if (num_points > 0) {
1982         if (num_points >= 2) {
1983             /*
1984              * unless we precompute multiples for just one point, converting
1985              * those into affine form is time well spent
1986              */
1987             mixed = 1;
1988         }
1989         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1990         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1991         if (mixed)
1992             tmp_felems =
1993                 OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1994         if ((secrets == NULL) || (pre_comp == NULL)
1995             || (mixed && (tmp_felems == NULL))) {
1996             ERR_raise(ERR_LIB_EC, ERR_R_MALLOC_FAILURE);
1997             goto err;
1998         }
1999
2000         /*
2001          * we treat NULL scalars as 0, and NULL points as points at infinity,
2002          * i.e., they contribute nothing to the linear combination
2003          */
2004         for (i = 0; i < num_points; ++i) {
2005             if (i == num) {
2006                 /*
2007                  * we didn't have a valid precomputation, so we pick the
2008                  * generator
2009                  */
2010                 p = EC_GROUP_get0_generator(group);
2011                 p_scalar = scalar;
2012             } else {
2013                 /* the i^th point */
2014                 p = points[i];
2015                 p_scalar = scalars[i];
2016             }
2017             if ((p_scalar != NULL) && (p != NULL)) {
2018                 /* reduce scalar to 0 <= scalar < 2^521 */
2019                 if ((BN_num_bits(p_scalar) > 521)
2020                     || (BN_is_negative(p_scalar))) {
2021                     /*
2022                      * this is an unusual input, and we don't guarantee
2023                      * constant-timeness
2024                      */
2025                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
2026                         ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
2027                         goto err;
2028                     }
2029                     num_bytes = BN_bn2lebinpad(tmp_scalar,
2030                                                secrets[i], sizeof(secrets[i]));
2031                 } else {
2032                     num_bytes = BN_bn2lebinpad(p_scalar,
2033                                                secrets[i], sizeof(secrets[i]));
2034                 }
2035                 if (num_bytes < 0) {
2036                     ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
2037                     goto err;
2038                 }
2039                 /* precompute multiples */
2040                 if ((!BN_to_felem(x_out, p->X)) ||
2041                     (!BN_to_felem(y_out, p->Y)) ||
2042                     (!BN_to_felem(z_out, p->Z)))
2043                     goto err;
2044                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
2045                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
2046                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
2047                 for (j = 2; j <= 16; ++j) {
2048                     if (j & 1) {
2049                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
2050                                   pre_comp[i][j][2], pre_comp[i][1][0],
2051                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
2052                                   pre_comp[i][j - 1][0],
2053                                   pre_comp[i][j - 1][1],
2054                                   pre_comp[i][j - 1][2]);
2055                     } else {
2056                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
2057                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
2058                                      pre_comp[i][j / 2][1],
2059                                      pre_comp[i][j / 2][2]);
2060                     }
2061                 }
2062             }
2063         }
2064         if (mixed)
2065             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
2066     }
2067
2068     /* the scalar for the generator */
2069     if ((scalar != NULL) && (have_pre_comp)) {
2070         memset(g_secret, 0, sizeof(g_secret));
2071         /* reduce scalar to 0 <= scalar < 2^521 */
2072         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
2073             /*
2074              * this is an unusual input, and we don't guarantee
2075              * constant-timeness
2076              */
2077             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2078                 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
2079                 goto err;
2080             }
2081             num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2082         } else {
2083             num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2084         }
2085         /* do the multiplication with generator precomputation */
2086         batch_mul(x_out, y_out, z_out,
2087                   (const felem_bytearray(*))secrets, num_points,
2088                   g_secret,
2089                   mixed, (const felem(*)[17][3])pre_comp,
2090                   (const felem(*)[3])g_pre_comp);
2091     } else {
2092         /* do the multiplication without generator precomputation */
2093         batch_mul(x_out, y_out, z_out,
2094                   (const felem_bytearray(*))secrets, num_points,
2095                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
2096     }
2097     /* reduce the output to its unique minimal representation */
2098     felem_contract(x_in, x_out);
2099     felem_contract(y_in, y_out);
2100     felem_contract(z_in, z_out);
2101     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2102         (!felem_to_BN(z, z_in))) {
2103         ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
2104         goto err;
2105     }
2106     ret = ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z,
2107                                                              ctx);
2108
2109  err:
2110     BN_CTX_end(ctx);
2111     EC_POINT_free(generator);
2112     OPENSSL_free(secrets);
2113     OPENSSL_free(pre_comp);
2114     OPENSSL_free(tmp_felems);
2115     return ret;
2116 }
2117
2118 int ossl_ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2119 {
2120     int ret = 0;
2121     NISTP521_PRE_COMP *pre = NULL;
2122     int i, j;
2123     BIGNUM *x, *y;
2124     EC_POINT *generator = NULL;
2125     felem tmp_felems[16];
2126 #ifndef FIPS_MODULE
2127     BN_CTX *new_ctx = NULL;
2128 #endif
2129
2130     /* throw away old precomputation */
2131     EC_pre_comp_free(group);
2132
2133 #ifndef FIPS_MODULE
2134     if (ctx == NULL)
2135         ctx = new_ctx = BN_CTX_new();
2136 #endif
2137     if (ctx == NULL)
2138         return 0;
2139
2140     BN_CTX_start(ctx);
2141     x = BN_CTX_get(ctx);
2142     y = BN_CTX_get(ctx);
2143     if (y == NULL)
2144         goto err;
2145     /* get the generator */
2146     if (group->generator == NULL)
2147         goto err;
2148     generator = EC_POINT_new(group);
2149     if (generator == NULL)
2150         goto err;
2151     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2152     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2153     if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2154         goto err;
2155     if ((pre = nistp521_pre_comp_new()) == NULL)
2156         goto err;
2157     /*
2158      * if the generator is the standard one, use built-in precomputation
2159      */
2160     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2161         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2162         goto done;
2163     }
2164     if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
2165         (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
2166         (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2167         goto err;
2168     /* compute 2^130*G, 2^260*G, 2^390*G */
2169     for (i = 1; i <= 4; i <<= 1) {
2170         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2171                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2172                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2173         for (j = 0; j < 129; ++j) {
2174             point_double(pre->g_pre_comp[2 * i][0],
2175                          pre->g_pre_comp[2 * i][1],
2176                          pre->g_pre_comp[2 * i][2],
2177                          pre->g_pre_comp[2 * i][0],
2178                          pre->g_pre_comp[2 * i][1],
2179                          pre->g_pre_comp[2 * i][2]);
2180         }
2181     }
2182     /* g_pre_comp[0] is the point at infinity */
2183     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2184     /* the remaining multiples */
2185     /* 2^130*G + 2^260*G */
2186     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2187               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2188               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2189               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2190               pre->g_pre_comp[2][2]);
2191     /* 2^130*G + 2^390*G */
2192     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2193               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2194               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2195               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2196               pre->g_pre_comp[2][2]);
2197     /* 2^260*G + 2^390*G */
2198     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2199               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2200               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2201               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2202               pre->g_pre_comp[4][2]);
2203     /* 2^130*G + 2^260*G + 2^390*G */
2204     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2205               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2206               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2207               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2208               pre->g_pre_comp[2][2]);
2209     for (i = 1; i < 8; ++i) {
2210         /* odd multiples: add G */
2211         point_add(pre->g_pre_comp[2 * i + 1][0],
2212                   pre->g_pre_comp[2 * i + 1][1],
2213                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2214                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2215                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2216                   pre->g_pre_comp[1][2]);
2217     }
2218     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2219
2220  done:
2221     SETPRECOMP(group, nistp521, pre);
2222     ret = 1;
2223     pre = NULL;
2224  err:
2225     BN_CTX_end(ctx);
2226     EC_POINT_free(generator);
2227 #ifndef FIPS_MODULE
2228     BN_CTX_free(new_ctx);
2229 #endif
2230     EC_nistp521_pre_comp_free(pre);
2231     return ret;
2232 }
2233
2234 int ossl_ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2235 {
2236     return HAVEPRECOMP(group, nistp521);
2237 }