]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/ec/ecp_nistp521.c
Version skew reduction: trivia (I hope).
[thirdparty/openssl.git] / crypto / ec / ecp_nistp521.c
1 /* crypto/ec/ecp_nistp521.c */
2 /*
3 * Written by Adam Langley (Google) for the OpenSSL project
4 */
5 /* Copyright 2011 Google Inc.
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 *
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 */
20
21 /*
22 * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
23 *
24 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26 * work which got its smarts from Daniel J. Bernstein's work on the same.
27 */
28
29 #include <openssl/opensslconf.h>
30 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
31
32 #ifndef OPENSSL_SYS_VMS
33 #include <stdint.h>
34 #else
35 #include <inttypes.h>
36 #endif
37
38 #include <string.h>
39 #include <openssl/err.h>
40 #include "ec_lcl.h"
41
42 #if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
43 /* even with gcc, the typedef won't work for 32-bit platforms */
44 typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */
45 #else
46 #error "Need GCC 3.1 or later to define type uint128_t"
47 #endif
48
49 typedef uint8_t u8;
50 typedef uint64_t u64;
51 typedef int64_t s64;
52
53 /* The underlying field.
54 *
55 * P521 operates over GF(2^521-1). We can serialise an element of this field
56 * into 66 bytes where the most significant byte contains only a single bit. We
57 * call this an felem_bytearray. */
58
59 typedef u8 felem_bytearray[66];
60
61 /* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
62 * These values are big-endian. */
63 static const felem_bytearray nistp521_curve_params[5] =
64 {
65 {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
66 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
67 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
68 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73 0xff, 0xff},
74 {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
75 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
76 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
77 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
80 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
81 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
82 0xff, 0xfc},
83 {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
84 0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
85 0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
86 0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
87 0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
88 0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
89 0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
90 0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
91 0x3f, 0x00},
92 {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
93 0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
94 0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
95 0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
96 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
97 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
98 0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
99 0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
100 0xbd, 0x66},
101 {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
102 0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
103 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
104 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
105 0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
106 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
107 0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
108 0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
109 0x66, 0x50}
110 };
111
112 /* The representation of field elements.
113 * ------------------------------------
114 *
115 * We represent field elements with nine values. These values are either 64 or
116 * 128 bits and the field element represented is:
117 * v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464 (mod p)
118 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
119 * 58 bits apart, but are greater than 58 bits in length, the most significant
120 * bits of each limb overlap with the least significant bits of the next.
121 *
122 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
123 * 'largefelem' */
124
125 #define NLIMBS 9
126
127 typedef uint64_t limb;
128 typedef limb felem[NLIMBS];
129 typedef uint128_t largefelem[NLIMBS];
130
131 static const limb bottom57bits = 0x1ffffffffffffff;
132 static const limb bottom58bits = 0x3ffffffffffffff;
133
134 /* bin66_to_felem takes a little-endian byte array and converts it into felem
135 * form. This assumes that the CPU is little-endian. */
136 static void bin66_to_felem(felem out, const u8 in[66])
137 {
138 out[0] = (*((limb*) &in[0])) & bottom58bits;
139 out[1] = (*((limb*) &in[7]) >> 2) & bottom58bits;
140 out[2] = (*((limb*) &in[14]) >> 4) & bottom58bits;
141 out[3] = (*((limb*) &in[21]) >> 6) & bottom58bits;
142 out[4] = (*((limb*) &in[29])) & bottom58bits;
143 out[5] = (*((limb*) &in[36]) >> 2) & bottom58bits;
144 out[6] = (*((limb*) &in[43]) >> 4) & bottom58bits;
145 out[7] = (*((limb*) &in[50]) >> 6) & bottom58bits;
146 out[8] = (*((limb*) &in[58])) & bottom57bits;
147 }
148
149 /* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
150 * array. This assumes that the CPU is little-endian. */
151 static void felem_to_bin66(u8 out[66], const felem in)
152 {
153 memset(out, 0, 66);
154 (*((limb*) &out[0])) = in[0];
155 (*((limb*) &out[7])) |= in[1] << 2;
156 (*((limb*) &out[14])) |= in[2] << 4;
157 (*((limb*) &out[21])) |= in[3] << 6;
158 (*((limb*) &out[29])) = in[4];
159 (*((limb*) &out[36])) |= in[5] << 2;
160 (*((limb*) &out[43])) |= in[6] << 4;
161 (*((limb*) &out[50])) |= in[7] << 6;
162 (*((limb*) &out[58])) = in[8];
163 }
164
165 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
166 static void flip_endian(u8 *out, const u8 *in, unsigned len)
167 {
168 unsigned i;
169 for (i = 0; i < len; ++i)
170 out[i] = in[len-1-i];
171 }
172
/* BN_to_felem converts an OpenSSL BIGNUM into an felem.
 * Returns 1 on success; returns 0 and raises EC_R_BIGNUM_OUT_OF_RANGE if
 * |bn| is negative or does not fit in 66 bytes. */
static int BN_to_felem(felem out, const BIGNUM *bn)
{
    felem_bytearray b_in;
    felem_bytearray b_out;
    unsigned num_bytes;

    /* BN_bn2bin eats leading zeroes, so clear the buffer first: any bytes
     * it does not write must read as zero. */
    memset(b_out, 0, sizeof b_out);
    num_bytes = BN_num_bytes(bn);
    if (num_bytes > sizeof b_out)
        {
        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
        return 0;
        }
    if (BN_is_negative(bn))
        {
        ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
        return 0;
        }
    /* BN_bn2bin writes big-endian; flip into the little-endian layout that
     * bin66_to_felem expects. */
    num_bytes = BN_bn2bin(bn, b_in);
    flip_endian(b_out, b_in, num_bytes);
    bin66_to_felem(out, b_out);
    return 1;
}
198
/* felem_to_BN converts an felem into an OpenSSL BIGNUM.
 * Returns the resulting BIGNUM (|out| if non-NULL), or NULL on BN
 * allocation failure. */
static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
{
    felem_bytearray b_in, b_out;

    /* serialise little-endian, then flip since BN_bin2bn wants big-endian */
    felem_to_bin66(b_in, in);
    flip_endian(b_out, b_in, sizeof b_out);
    return BN_bin2bn(b_out, sizeof b_out, out);
}
207
208
209 /* Field operations
210 * ---------------- */
211
212 static void felem_one(felem out)
213 {
214 out[0] = 1;
215 out[1] = 0;
216 out[2] = 0;
217 out[3] = 0;
218 out[4] = 0;
219 out[5] = 0;
220 out[6] = 0;
221 out[7] = 0;
222 out[8] = 0;
223 }
224
225 static void felem_assign(felem out, const felem in)
226 {
227 out[0] = in[0];
228 out[1] = in[1];
229 out[2] = in[2];
230 out[3] = in[3];
231 out[4] = in[4];
232 out[5] = in[5];
233 out[6] = in[6];
234 out[7] = in[7];
235 out[8] = in[8];
236 }
237
238 /* felem_sum64 sets out = out + in. */
239 static void felem_sum64(felem out, const felem in)
240 {
241 out[0] += in[0];
242 out[1] += in[1];
243 out[2] += in[2];
244 out[3] += in[3];
245 out[4] += in[4];
246 out[5] += in[5];
247 out[6] += in[6];
248 out[7] += in[7];
249 out[8] += in[8];
250 }
251
252 /* felem_scalar sets out = in * scalar */
253 static void felem_scalar(felem out, const felem in, limb scalar)
254 {
255 out[0] = in[0] * scalar;
256 out[1] = in[1] * scalar;
257 out[2] = in[2] * scalar;
258 out[3] = in[3] * scalar;
259 out[4] = in[4] * scalar;
260 out[5] = in[5] * scalar;
261 out[6] = in[6] * scalar;
262 out[7] = in[7] * scalar;
263 out[8] = in[8] * scalar;
264 }
265
266 /* felem_scalar64 sets out = out * scalar */
267 static void felem_scalar64(felem out, limb scalar)
268 {
269 out[0] *= scalar;
270 out[1] *= scalar;
271 out[2] *= scalar;
272 out[3] *= scalar;
273 out[4] *= scalar;
274 out[5] *= scalar;
275 out[6] *= scalar;
276 out[7] *= scalar;
277 out[8] *= scalar;
278 }
279
280 /* felem_scalar128 sets out = out * scalar */
281 static void felem_scalar128(largefelem out, limb scalar)
282 {
283 out[0] *= scalar;
284 out[1] *= scalar;
285 out[2] *= scalar;
286 out[3] *= scalar;
287 out[4] *= scalar;
288 out[5] *= scalar;
289 out[6] *= scalar;
290 out[7] *= scalar;
291 out[8] *= scalar;
292 }
293
294 /* felem_neg sets |out| to |-in|
295 * On entry:
296 * in[i] < 2^59 + 2^14
297 * On exit:
298 * out[i] < 2^62
299 */
300 static void felem_neg(felem out, const felem in)
301 {
302 /* In order to prevent underflow, we subtract from 0 mod p. */
303 static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
304 static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
305
306 out[0] = two62m3 - in[0];
307 out[1] = two62m2 - in[1];
308 out[2] = two62m2 - in[2];
309 out[3] = two62m2 - in[3];
310 out[4] = two62m2 - in[4];
311 out[5] = two62m2 - in[5];
312 out[6] = two62m2 - in[6];
313 out[7] = two62m2 - in[7];
314 out[8] = two62m2 - in[8];
315 }
316
317 /* felem_diff64 subtracts |in| from |out|
318 * On entry:
319 * in[i] < 2^59 + 2^14
320 * On exit:
321 * out[i] < out[i] + 2^62
322 */
323 static void felem_diff64(felem out, const felem in)
324 {
325 /* In order to prevent underflow, we add 0 mod p before subtracting. */
326 static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
327 static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
328
329 out[0] += two62m3 - in[0];
330 out[1] += two62m2 - in[1];
331 out[2] += two62m2 - in[2];
332 out[3] += two62m2 - in[3];
333 out[4] += two62m2 - in[4];
334 out[5] += two62m2 - in[5];
335 out[6] += two62m2 - in[6];
336 out[7] += two62m2 - in[7];
337 out[8] += two62m2 - in[8];
338 }
339
340 /* felem_diff_128_64 subtracts |in| from |out|
341 * On entry:
342 * in[i] < 2^62 + 2^17
343 * On exit:
344 * out[i] < out[i] + 2^63
345 */
346 static void felem_diff_128_64(largefelem out, const felem in)
347 {
348 /* In order to prevent underflow, we add 0 mod p before subtracting. */
349 static const limb two63m6 = (((limb)1) << 62) - (((limb)1) << 5);
350 static const limb two63m5 = (((limb)1) << 62) - (((limb)1) << 4);
351
352 out[0] += two63m6 - in[0];
353 out[1] += two63m5 - in[1];
354 out[2] += two63m5 - in[2];
355 out[3] += two63m5 - in[3];
356 out[4] += two63m5 - in[4];
357 out[5] += two63m5 - in[5];
358 out[6] += two63m5 - in[6];
359 out[7] += two63m5 - in[7];
360 out[8] += two63m5 - in[8];
361 }
362
363 /* felem_diff_128_64 subtracts |in| from |out|
364 * On entry:
365 * in[i] < 2^126
366 * On exit:
367 * out[i] < out[i] + 2^127 - 2^69
368 */
369 static void felem_diff128(largefelem out, const largefelem in)
370 {
371 /* In order to prevent underflow, we add 0 mod p before subtracting. */
372 static const uint128_t two127m70 = (((uint128_t)1) << 127) - (((uint128_t)1) << 70);
373 static const uint128_t two127m69 = (((uint128_t)1) << 127) - (((uint128_t)1) << 69);
374
375 out[0] += (two127m70 - in[0]);
376 out[1] += (two127m69 - in[1]);
377 out[2] += (two127m69 - in[2]);
378 out[3] += (two127m69 - in[3]);
379 out[4] += (two127m69 - in[4]);
380 out[5] += (two127m69 - in[5]);
381 out[6] += (two127m69 - in[6]);
382 out[7] += (two127m69 - in[7]);
383 out[8] += (two127m69 - in[8]);
384 }
385
/* felem_square sets |out| = |in|^2
 * On entry:
 *   in[i] < 2^62
 * On exit:
 *   out[i] < 17 * max(in[i]) * max(in[i])
 */
static void felem_square(largefelem out, const felem in)
{
    felem inx2, inx4;
    felem_scalar(inx2, in, 2);
    felem_scalar(inx4, in, 4);

    /* We have many cases were we want to do
     *   in[x] * in[y] +
     *   in[y] * in[x]
     * This is obviously just
     *   2 * in[x] * in[y]
     * However, rather than do the doubling on the 128 bit result, we
     * double one of the inputs to the multiplication by reading from
     * |inx2| */

    /* Columns 0..8: products whose limb indices sum to the column index.
     * Diagonal terms (in[k]*in[k]) appear once and are not doubled. */
    out[0] = ((uint128_t) in[0]) * in[0];
    out[1] = ((uint128_t) in[0]) * inx2[1];
    out[2] = ((uint128_t) in[0]) * inx2[2] +
             ((uint128_t) in[1]) * in[1];
    out[3] = ((uint128_t) in[0]) * inx2[3] +
             ((uint128_t) in[1]) * inx2[2];
    out[4] = ((uint128_t) in[0]) * inx2[4] +
             ((uint128_t) in[1]) * inx2[3] +
             ((uint128_t) in[2]) * in[2];
    out[5] = ((uint128_t) in[0]) * inx2[5] +
             ((uint128_t) in[1]) * inx2[4] +
             ((uint128_t) in[2]) * inx2[3];
    out[6] = ((uint128_t) in[0]) * inx2[6] +
             ((uint128_t) in[1]) * inx2[5] +
             ((uint128_t) in[2]) * inx2[4] +
             ((uint128_t) in[3]) * in[3];
    out[7] = ((uint128_t) in[0]) * inx2[7] +
             ((uint128_t) in[1]) * inx2[6] +
             ((uint128_t) in[2]) * inx2[5] +
             ((uint128_t) in[3]) * inx2[4];
    out[8] = ((uint128_t) in[0]) * inx2[8] +
             ((uint128_t) in[1]) * inx2[7] +
             ((uint128_t) in[2]) * inx2[6] +
             ((uint128_t) in[3]) * inx2[5] +
             ((uint128_t) in[4]) * in[4];

    /* The remaining limbs fall above 2^521, with the first falling at
     * 2^522. They correspond to locations one bit up from the limbs
     * produced above so we would have to multiply by two to align them.
     * Again, rather than operate on the 128-bit result, we double one of
     * the inputs to the multiplication. If we want to double for both this
     * reason, and the reason above, then we end up multiplying by four. */

    /* Columns 9..16 wrap around: since 2^521 = 1 mod p, column 9+k folds
     * into column k with an extra factor of two, so off-diagonal terms use
     * inx4 (2 for the wrap, 2 for the symmetric pair) and diagonal terms
     * use inx2 (just the wrap). */

    /* 9 */
    out[0] += ((uint128_t) in[1]) * inx4[8] +
              ((uint128_t) in[2]) * inx4[7] +
              ((uint128_t) in[3]) * inx4[6] +
              ((uint128_t) in[4]) * inx4[5];

    /* 10 */
    out[1] += ((uint128_t) in[2]) * inx4[8] +
              ((uint128_t) in[3]) * inx4[7] +
              ((uint128_t) in[4]) * inx4[6] +
              ((uint128_t) in[5]) * inx2[5];

    /* 11 */
    out[2] += ((uint128_t) in[3]) * inx4[8] +
              ((uint128_t) in[4]) * inx4[7] +
              ((uint128_t) in[5]) * inx4[6];

    /* 12 */
    out[3] += ((uint128_t) in[4]) * inx4[8] +
              ((uint128_t) in[5]) * inx4[7] +
              ((uint128_t) in[6]) * inx2[6];

    /* 13 */
    out[4] += ((uint128_t) in[5]) * inx4[8] +
              ((uint128_t) in[6]) * inx4[7];

    /* 14 */
    out[5] += ((uint128_t) in[6]) * inx4[8] +
              ((uint128_t) in[7]) * inx2[7];

    /* 15 */
    out[6] += ((uint128_t) in[7]) * inx4[8];

    /* 16 */
    out[7] += ((uint128_t) in[8]) * inx2[8];
}
476
/* felem_mul sets |out| = |in1| * |in2|
 * On entry:
 *   in1[i] < 2^64
 *   in2[i] < 2^63
 * On exit:
 *   out[i] < 17 * max(in1[i]) * max(in2[i])
 */
static void felem_mul(largefelem out, const felem in1, const felem in2)
{
    felem in2x2;
    felem_scalar(in2x2, in2, 2);

    /* Columns 0..8 of the schoolbook product: terms whose limb indices sum
     * to the column index. */
    out[0] = ((uint128_t) in1[0]) * in2[0];

    out[1] = ((uint128_t) in1[0]) * in2[1] +
             ((uint128_t) in1[1]) * in2[0];

    out[2] = ((uint128_t) in1[0]) * in2[2] +
             ((uint128_t) in1[1]) * in2[1] +
             ((uint128_t) in1[2]) * in2[0];

    out[3] = ((uint128_t) in1[0]) * in2[3] +
             ((uint128_t) in1[1]) * in2[2] +
             ((uint128_t) in1[2]) * in2[1] +
             ((uint128_t) in1[3]) * in2[0];

    out[4] = ((uint128_t) in1[0]) * in2[4] +
             ((uint128_t) in1[1]) * in2[3] +
             ((uint128_t) in1[2]) * in2[2] +
             ((uint128_t) in1[3]) * in2[1] +
             ((uint128_t) in1[4]) * in2[0];

    out[5] = ((uint128_t) in1[0]) * in2[5] +
             ((uint128_t) in1[1]) * in2[4] +
             ((uint128_t) in1[2]) * in2[3] +
             ((uint128_t) in1[3]) * in2[2] +
             ((uint128_t) in1[4]) * in2[1] +
             ((uint128_t) in1[5]) * in2[0];

    out[6] = ((uint128_t) in1[0]) * in2[6] +
             ((uint128_t) in1[1]) * in2[5] +
             ((uint128_t) in1[2]) * in2[4] +
             ((uint128_t) in1[3]) * in2[3] +
             ((uint128_t) in1[4]) * in2[2] +
             ((uint128_t) in1[5]) * in2[1] +
             ((uint128_t) in1[6]) * in2[0];

    out[7] = ((uint128_t) in1[0]) * in2[7] +
             ((uint128_t) in1[1]) * in2[6] +
             ((uint128_t) in1[2]) * in2[5] +
             ((uint128_t) in1[3]) * in2[4] +
             ((uint128_t) in1[4]) * in2[3] +
             ((uint128_t) in1[5]) * in2[2] +
             ((uint128_t) in1[6]) * in2[1] +
             ((uint128_t) in1[7]) * in2[0];

    out[8] = ((uint128_t) in1[0]) * in2[8] +
             ((uint128_t) in1[1]) * in2[7] +
             ((uint128_t) in1[2]) * in2[6] +
             ((uint128_t) in1[3]) * in2[5] +
             ((uint128_t) in1[4]) * in2[4] +
             ((uint128_t) in1[5]) * in2[3] +
             ((uint128_t) in1[6]) * in2[2] +
             ((uint128_t) in1[7]) * in2[1] +
             ((uint128_t) in1[8]) * in2[0];

    /* See comment in felem_square about the use of in2x2 here: columns
     * 9..16 land one bit above where column 0..7 limbs sit after the
     * 2^521 = 1 mod p wrap-around, so one factor is pre-doubled. */

    out[0] += ((uint128_t) in1[1]) * in2x2[8] +
              ((uint128_t) in1[2]) * in2x2[7] +
              ((uint128_t) in1[3]) * in2x2[6] +
              ((uint128_t) in1[4]) * in2x2[5] +
              ((uint128_t) in1[5]) * in2x2[4] +
              ((uint128_t) in1[6]) * in2x2[3] +
              ((uint128_t) in1[7]) * in2x2[2] +
              ((uint128_t) in1[8]) * in2x2[1];

    out[1] += ((uint128_t) in1[2]) * in2x2[8] +
              ((uint128_t) in1[3]) * in2x2[7] +
              ((uint128_t) in1[4]) * in2x2[6] +
              ((uint128_t) in1[5]) * in2x2[5] +
              ((uint128_t) in1[6]) * in2x2[4] +
              ((uint128_t) in1[7]) * in2x2[3] +
              ((uint128_t) in1[8]) * in2x2[2];

    out[2] += ((uint128_t) in1[3]) * in2x2[8] +
              ((uint128_t) in1[4]) * in2x2[7] +
              ((uint128_t) in1[5]) * in2x2[6] +
              ((uint128_t) in1[6]) * in2x2[5] +
              ((uint128_t) in1[7]) * in2x2[4] +
              ((uint128_t) in1[8]) * in2x2[3];

    out[3] += ((uint128_t) in1[4]) * in2x2[8] +
              ((uint128_t) in1[5]) * in2x2[7] +
              ((uint128_t) in1[6]) * in2x2[6] +
              ((uint128_t) in1[7]) * in2x2[5] +
              ((uint128_t) in1[8]) * in2x2[4];

    out[4] += ((uint128_t) in1[5]) * in2x2[8] +
              ((uint128_t) in1[6]) * in2x2[7] +
              ((uint128_t) in1[7]) * in2x2[6] +
              ((uint128_t) in1[8]) * in2x2[5];

    out[5] += ((uint128_t) in1[6]) * in2x2[8] +
              ((uint128_t) in1[7]) * in2x2[7] +
              ((uint128_t) in1[8]) * in2x2[6];

    out[6] += ((uint128_t) in1[7]) * in2x2[8] +
              ((uint128_t) in1[8]) * in2x2[7];

    out[7] += ((uint128_t) in1[8]) * in2x2[8];
}
589
590 static const limb bottom52bits = 0xfffffffffffff;
591
/* felem_reduce converts a largefelem to an felem.
 * On entry:
 *   in[i] < 2^128
 * On exit:
 *   out[i] < 2^59 + 2^14
 *
 * Each 128-bit input limb is split three ways: its low 58 bits stay in
 * place; bits 58..63 carry into the next limb; bits 64..115 (52 bits of
 * the high word) also belong to the next limb, shifted left by 6 so they
 * sit above the first carry; bits 116..127 carry into the limb after
 * that. The contributions that would land in a tenth limb (position
 * 2^522) wrap around: 2^521 = 1 mod p, so 2^522 = 2, hence the final
 * shift-by-one of overflow1/overflow2. */
static void felem_reduce(felem out, const largefelem in)
{
    u64 overflow1, overflow2;

    out[0] = ((limb) in[0]) & bottom58bits;
    out[1] = ((limb) in[1]) & bottom58bits;
    out[2] = ((limb) in[2]) & bottom58bits;
    out[3] = ((limb) in[3]) & bottom58bits;
    out[4] = ((limb) in[4]) & bottom58bits;
    out[5] = ((limb) in[5]) & bottom58bits;
    out[6] = ((limb) in[6]) & bottom58bits;
    out[7] = ((limb) in[7]) & bottom58bits;
    out[8] = ((limb) in[8]) & bottom58bits;

    /* out[i] < 2^58 */

    out[1] += ((limb) in[0]) >> 58;
    out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
    /* out[1] < 2^58 + 2^6 + 2^58
     *        = 2^59 + 2^6 */
    out[2] += ((limb) (in[0] >> 64)) >> 52;

    out[2] += ((limb) in[1]) >> 58;
    out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
    out[3] += ((limb) (in[1] >> 64)) >> 52;

    out[3] += ((limb) in[2]) >> 58;
    out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
    out[4] += ((limb) (in[2] >> 64)) >> 52;

    out[4] += ((limb) in[3]) >> 58;
    out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
    out[5] += ((limb) (in[3] >> 64)) >> 52;

    out[5] += ((limb) in[4]) >> 58;
    out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
    out[6] += ((limb) (in[4] >> 64)) >> 52;

    out[6] += ((limb) in[5]) >> 58;
    out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
    out[7] += ((limb) (in[5] >> 64)) >> 52;

    out[7] += ((limb) in[6]) >> 58;
    out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
    out[8] += ((limb) (in[6] >> 64)) >> 52;

    out[8] += ((limb) in[7]) >> 58;
    out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
    /* out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
     *            < 2^59 + 2^13 */
    /* in[8]'s carries would go to a tenth limb; accumulate them in
     * overflow1/overflow2 and fold them back into limbs 0 and 1. */
    overflow1 = ((limb) (in[7] >> 64)) >> 52;

    overflow1 += ((limb) in[8]) >> 58;
    overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
    overflow2 = ((limb) (in[8] >> 64)) >> 52;

    /* 2^522 = 2 mod p, so the wrapped values are doubled. */
    overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
    overflow2 <<= 1;            /* overflow2 < 2^13 */

    out[0] += overflow1;        /* out[0] < 2^60 */
    out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */

    /* one carry step brings out[0] back under 2^58 */
    out[1] += out[0] >> 58; out[0] &= bottom58bits;
    /* out[0] < 2^58
     * out[1] < 2^59 + 2^6 + 2^13 + 2^2
     *        < 2^59 + 2^14 */
}
665
666 static void felem_square_reduce(felem out, const felem in)
667 {
668 largefelem tmp;
669 felem_square(tmp, in);
670 felem_reduce(out, tmp);
671 }
672
673 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
674 {
675 largefelem tmp;
676 felem_mul(tmp, in1, in2);
677 felem_reduce(out, tmp);
678 }
679
/* felem_inv calculates |out| = |in|^{-1}
 *
 * Based on Fermat's Little Theorem:
 *   a^p = a (mod p)
 *   a^{p-1} = 1 (mod p)
 *   a^{p-2} = a^{-1} (mod p)
 *
 * The exponent p-2 = 2^521 - 3 is built with a square-and-multiply
 * addition chain. Each trailing comment gives the exponent held after
 * that line: e.g. after "2^4 - 2^1" the variable holds in^(2^4 - 2^1).
 * |in| must be non-zero (the caller guarantees this for curve points). */
static void felem_inv(felem out, const felem in)
{
    felem ftmp, ftmp2, ftmp3, ftmp4;
    largefelem tmp;
    unsigned i;

    felem_square(tmp, in); felem_reduce(ftmp, tmp);             /* 2^1 */
    felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);          /* 2^2 - 2^0 */
    felem_assign(ftmp2, ftmp);
    felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);           /* 2^3 - 2^1 */
    felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);          /* 2^3 - 2^0 */
    felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);           /* 2^4 - 2^1 */

    felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);         /* 2^3 - 2^1 */
    felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);         /* 2^4 - 2^2 */
    felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);     /* 2^4 - 2^0 */

    felem_assign(ftmp2, ftmp3);
    felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);         /* 2^5 - 2^1 */
    felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);         /* 2^6 - 2^2 */
    felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);         /* 2^7 - 2^3 */
    felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);         /* 2^8 - 2^4 */
    felem_assign(ftmp4, ftmp3);
    felem_mul(tmp, ftmp3, ftmp); felem_reduce(ftmp4, tmp);      /* 2^8 - 2^1 */
    felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);         /* 2^9 - 2^2 */
    felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);     /* 2^8 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    /* Repeated doubling of the exponent span: each loop squares k times,
     * then one multiply fills in the low bits, giving 2^(2k) - 2^0. */
    for (i = 0; i < 8; i++)
        {
        felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^16 - 2^8 */
        }
    felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);     /* 2^16 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 16; i++)
        {
        felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^32 - 2^16 */
        }
    felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);     /* 2^32 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 32; i++)
        {
        felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^64 - 2^32 */
        }
    felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);     /* 2^64 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 64; i++)
        {
        felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^128 - 2^64 */
        }
    felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);     /* 2^128 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 128; i++)
        {
        felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^256 - 2^128 */
        }
    felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);     /* 2^256 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    for (i = 0; i < 256; i++)
        {
        felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^512 - 2^256 */
        }
    felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);     /* 2^512 - 2^0 */

    for (i = 0; i < 9; i++)
        {
        felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^521 - 2^9 */
        }
    /* ftmp4 holds 2^9 - 2^2, so this yields 2^521 - 2^2 (the original
     * comments here said 2^512, which was a typo). */
    felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp);     /* 2^521 - 2^2 */
    felem_mul(tmp, ftmp3, in); felem_reduce(out, tmp);          /* 2^521 - 3 */
}
763
764 /* This is 2^521-1, expressed as an felem */
765 static const felem kPrime =
766 {
767 0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
768 0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
769 0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
770 };
771
/* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 * otherwise.
 * On entry:
 *   in[i] < 2^59 + 2^14
 * This is constant-time: no branches depend on the value of |in|. */
static limb felem_is_zero(const felem in)
{
    felem ftmp;
    limb is_zero, is_p;
    felem_assign(ftmp, in);

    /* Propagate carries so every limb fits its nominal width; the top
     * limb's excess wraps to limb 0 since 2^521 = 1 mod p. */
    ftmp[0] += ftmp[8] >> 57; ftmp[8] &= bottom57bits;
    /* ftmp[8] < 2^57 */
    ftmp[1] += ftmp[0] >> 58; ftmp[0] &= bottom58bits;
    ftmp[2] += ftmp[1] >> 58; ftmp[1] &= bottom58bits;
    ftmp[3] += ftmp[2] >> 58; ftmp[2] &= bottom58bits;
    ftmp[4] += ftmp[3] >> 58; ftmp[3] &= bottom58bits;
    ftmp[5] += ftmp[4] >> 58; ftmp[4] &= bottom58bits;
    ftmp[6] += ftmp[5] >> 58; ftmp[5] &= bottom58bits;
    ftmp[7] += ftmp[6] >> 58; ftmp[6] &= bottom58bits;
    ftmp[8] += ftmp[7] >> 58; ftmp[7] &= bottom58bits;
    /* ftmp[8] < 2^57 + 4 */

    /* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is
     * greater than our bound for ftmp[8]. Therefore we only have to check
     * if the value is zero or 2^521-1. */

    is_zero = 0;
    is_zero |= ftmp[0];
    is_zero |= ftmp[1];
    is_zero |= ftmp[2];
    is_zero |= ftmp[3];
    is_zero |= ftmp[4];
    is_zero |= ftmp[5];
    is_zero |= ftmp[6];
    is_zero |= ftmp[7];
    is_zero |= ftmp[8];

    is_zero--;
    /* We know that ftmp[i] < 2^63, therefore the only way that the top bit
     * can be set is if is_zero was 0 before the decrement. */
    is_zero = ((s64) is_zero) >> 63;  /* arithmetic shift smears the top bit */

    /* is_p is zero iff ftmp == kPrime, limb for limb. */
    is_p = ftmp[0] ^ kPrime[0];
    is_p |= ftmp[1] ^ kPrime[1];
    is_p |= ftmp[2] ^ kPrime[2];
    is_p |= ftmp[3] ^ kPrime[3];
    is_p |= ftmp[4] ^ kPrime[4];
    is_p |= ftmp[5] ^ kPrime[5];
    is_p |= ftmp[6] ^ kPrime[6];
    is_p |= ftmp[7] ^ kPrime[7];
    is_p |= ftmp[8] ^ kPrime[8];

    is_p--;
    is_p = ((s64) is_p) >> 63;

    /* zero mod p means the value is 0 or exactly p */
    is_zero |= is_p;
    return is_zero;
}
831
832 static int felem_is_zero_int(const felem in)
833 {
834 return (int) (felem_is_zero(in) & ((limb)1));
835 }
836
/* felem_contract converts |in| to its unique, minimal representation.
 * On entry:
 *   in[i] < 2^59 + 2^14
 * This is constant-time: no branches depend on the value of |in|. */
static void felem_contract(felem out, const felem in)
{
    limb is_p, is_greater, sign;
    static const limb two58 = ((limb)1) << 58;

    felem_assign(out, in);

    /* Carry-propagate; the top limb's excess wraps since 2^521 = 1 mod p. */
    out[0] += out[8] >> 57; out[8] &= bottom57bits;
    /* out[8] < 2^57 */
    out[1] += out[0] >> 58; out[0] &= bottom58bits;
    out[2] += out[1] >> 58; out[1] &= bottom58bits;
    out[3] += out[2] >> 58; out[2] &= bottom58bits;
    out[4] += out[3] >> 58; out[3] &= bottom58bits;
    out[5] += out[4] >> 58; out[4] &= bottom58bits;
    out[6] += out[5] >> 58; out[5] &= bottom58bits;
    out[7] += out[6] >> 58; out[6] &= bottom58bits;
    out[8] += out[7] >> 58; out[7] &= bottom58bits;
    /* out[8] < 2^57 + 4 */

    /* If the value is greater than 2^521-1 then we have to subtract
     * 2^521-1 out. See the comments in felem_is_zero regarding why we
     * don't test for other multiples of the prime. */

    /* First, if |out| is equal to 2^521-1, we subtract it out to get zero. */

    is_p = out[0] ^ kPrime[0];
    is_p |= out[1] ^ kPrime[1];
    is_p |= out[2] ^ kPrime[2];
    is_p |= out[3] ^ kPrime[3];
    is_p |= out[4] ^ kPrime[4];
    is_p |= out[5] ^ kPrime[5];
    is_p |= out[6] ^ kPrime[6];
    is_p |= out[7] ^ kPrime[7];
    is_p |= out[8] ^ kPrime[8];

    /* Spread "any bit set" into the sign bit, then smear it, branch-free. */
    is_p--;
    is_p &= is_p << 32;
    is_p &= is_p << 16;
    is_p &= is_p << 8;
    is_p &= is_p << 4;
    is_p &= is_p << 2;
    is_p &= is_p << 1;
    is_p = ((s64) is_p) >> 63;
    is_p = ~is_p;

    /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */

    out[0] &= is_p;
    out[1] &= is_p;
    out[2] &= is_p;
    out[3] &= is_p;
    out[4] &= is_p;
    out[5] &= is_p;
    out[6] &= is_p;
    out[7] &= is_p;
    out[8] &= is_p;

    /* In order to test that |out| >= 2^521-1 we need only test if out[8]
     * >> 57 is greater than zero as (2^521-1) + x >= 2^522 */
    is_greater = out[8] >> 57;
    is_greater |= is_greater << 32;
    is_greater |= is_greater << 16;
    is_greater |= is_greater << 8;
    is_greater |= is_greater << 4;
    is_greater |= is_greater << 2;
    is_greater |= is_greater << 1;
    is_greater = ((s64) is_greater) >> 63;

    /* Conditionally subtract p; some limbs may go "negative" (i.e. wrap
     * as unsigned), fixed up below. */
    out[0] -= kPrime[0] & is_greater;
    out[1] -= kPrime[1] & is_greater;
    out[2] -= kPrime[2] & is_greater;
    out[3] -= kPrime[3] & is_greater;
    out[4] -= kPrime[4] & is_greater;
    out[5] -= kPrime[5] & is_greater;
    out[6] -= kPrime[6] & is_greater;
    out[7] -= kPrime[7] & is_greater;
    out[8] -= kPrime[8] & is_greater;

    /* Eliminate negative coefficients: a "negative" limb (top bit set as
     * unsigned) is made positive by adding 2^58 and borrowing 1 from the
     * next limb; limbs 5-7 are corrected a second time at the end.
     * NOTE(review): the sixth line below reads the sign of out[0] where the
     * pattern suggests out[5] was intended. out[0] is already non-negative
     * at that point, so the line cannot fire, and the repeated corrections
     * for limbs 5-7 at the end still leave every limb non-negative -- but
     * this should be confirmed against upstream. */
    sign = -(out[0] >> 63); out[0] += (two58 & sign); out[1] -= (1 & sign);
    sign = -(out[1] >> 63); out[1] += (two58 & sign); out[2] -= (1 & sign);
    sign = -(out[2] >> 63); out[2] += (two58 & sign); out[3] -= (1 & sign);
    sign = -(out[3] >> 63); out[3] += (two58 & sign); out[4] -= (1 & sign);
    sign = -(out[4] >> 63); out[4] += (two58 & sign); out[5] -= (1 & sign);
    sign = -(out[0] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
    sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
    sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
    sign = -(out[5] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
    sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
    sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
}
932
933 /* Group operations
934 * ----------------
935 *
936 * Building on top of the field operations we have the operations on the
937 * elliptic curve group itself. Points on the curve are represented in Jacobian
938 * coordinates */
939
/* point_double calculates 2*(x_in, y_in, z_in)
 *
 * The method is taken from:
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
 *
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 * while x_out == y_in is not (maybe this works, but it's not tested).
 *
 * The bracketed annotations below track an upper bound on the magnitude of
 * each limb after the operation, which justifies that no 128-bit
 * intermediate ever overflows before the next felem_reduce. */
static void
point_double(felem x_out, felem y_out, felem z_out,
	const felem x_in, const felem y_in, const felem z_in)
	{
	largefelem tmp, tmp2;
	felem delta, gamma, beta, alpha, ftmp, ftmp2;

	felem_assign(ftmp, x_in);
	felem_assign(ftmp2, x_in);

	/* delta = z^2 */
	felem_square(tmp, z_in);
	felem_reduce(delta, tmp);	/* delta[i] < 2^59 + 2^14 */

	/* gamma = y^2 */
	felem_square(tmp, y_in);
	felem_reduce(gamma, tmp);	/* gamma[i] < 2^59 + 2^14 */

	/* beta = x*gamma */
	felem_mul(tmp, x_in, gamma);
	felem_reduce(beta, tmp);	/* beta[i] < 2^59 + 2^14 */

	/* alpha = 3*(x-delta)*(x+delta) */
	felem_diff64(ftmp, delta);
	/* ftmp[i] < 2^61 */
	felem_sum64(ftmp2, delta);
	/* ftmp2[i] < 2^60 + 2^15 */
	felem_scalar64(ftmp2, 3);
	/* ftmp2[i] < 3*2^60 + 3*2^15 */
	felem_mul(tmp, ftmp, ftmp2);
	/* tmp[i] < 17(3*2^121 + 3*2^76)
	 *        = 61*2^121 + 61*2^76
	 *        < 64*2^121 + 64*2^76
	 *        = 2^127 + 2^82
	 *        < 2^128 */
	felem_reduce(alpha, tmp);

	/* x' = alpha^2 - 8*beta */
	felem_square(tmp, alpha);
	/* tmp[i] < 17*2^120
	 *        < 2^125 */
	felem_assign(ftmp, beta);
	felem_scalar64(ftmp, 8);
	/* ftmp[i] < 2^62 + 2^17 */
	felem_diff_128_64(tmp, ftmp);
	/* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
	felem_reduce(x_out, tmp);

	/* z' = (y + z)^2 - gamma - delta */
	felem_sum64(delta, gamma);
	/* delta[i] < 2^60 + 2^15 */
	felem_assign(ftmp, y_in);
	felem_sum64(ftmp, z_in);
	/* ftmp[i] < 2^60 + 2^15 */
	felem_square(tmp, ftmp);
	/* tmp[i] < 17(2^122)
	 *        < 2^127 */
	felem_diff_128_64(tmp, delta);
	/* tmp[i] < 2^127 + 2^63 */
	felem_reduce(z_out, tmp);

	/* y' = alpha*(4*beta - x') - 8*gamma^2 */
	felem_scalar64(beta, 4);
	/* beta[i] < 2^61 + 2^16 */
	felem_diff64(beta, x_out);
	/* beta[i] < 2^61 + 2^60 + 2^16 */
	felem_mul(tmp, alpha, beta);
	/* tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
	 *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
	 *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
	 *        < 2^128 */
	felem_square(tmp2, gamma);
	/* tmp2[i] < 17*(2^59 + 2^14)^2
	 *         = 17*(2^118 + 2^74 + 2^28) */
	felem_scalar128(tmp2, 8);
	/* tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
	 *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
	 *         < 2^126 */
	felem_diff128(tmp, tmp2);
	/* tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
	 *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
	 *          2^74 + 2^69 + 2^34 + 2^30
	 *        < 2^128 */
	felem_reduce(y_out, tmp);
	}
1032
1033 /* copy_conditional copies in to out iff mask is all ones. */
1034 static void
1035 copy_conditional(felem out, const felem in, limb mask)
1036 {
1037 unsigned i;
1038 for (i = 0; i < NLIMBS; ++i)
1039 {
1040 const limb tmp = mask & (in[i] ^ out[i]);
1041 out[i] ^= tmp;
1042 }
1043 }
1044
/* point_add calculates (x1, y1, z1) + (x2, y2, z2)
 *
 * The method is taken from
 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
 *
 * This function includes a branch for checking whether the two input points
 * are equal (while not equal to the point at infinity). This case never
 * happens during single point multiplication, so there is no timing leak for
 * ECDH or ECDSA signing.
 *
 * As in point_double, the bracketed annotations track limb-magnitude bounds
 * between reductions. */
static void point_add(felem x3, felem y3, felem z3,
	const felem x1, const felem y1, const felem z1,
	const int mixed, const felem x2, const felem y2, const felem z2)
	{
	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
	largefelem tmp, tmp2;
	limb x_equal, y_equal, z1_is_zero, z2_is_zero;

	z1_is_zero = felem_is_zero(z1);
	z2_is_zero = felem_is_zero(z2);

	/* ftmp = z1z1 = z1**2 */
	felem_square(tmp, z1);
	felem_reduce(ftmp, tmp);

	if (!mixed)
		{
		/* ftmp2 = z2z2 = z2**2 */
		felem_square(tmp, z2);
		felem_reduce(ftmp2, tmp);

		/* u1 = ftmp3 = x1*z2z2 */
		felem_mul(tmp, x1, ftmp2);
		felem_reduce(ftmp3, tmp);

		/* ftmp5 = z1 + z2 */
		felem_assign(ftmp5, z1);
		felem_sum64(ftmp5, z2);
		/* ftmp5[i] < 2^61 */

		/* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
		felem_square(tmp, ftmp5);
		/* tmp[i] < 17*2^122 */
		felem_diff_128_64(tmp, ftmp);
		/* tmp[i] < 17*2^122 + 2^63 */
		felem_diff_128_64(tmp, ftmp2);
		/* tmp[i] < 17*2^122 + 2^64 */
		felem_reduce(ftmp5, tmp);

		/* ftmp2 = z2 * z2z2 */
		felem_mul(tmp, ftmp2, z2);
		felem_reduce(ftmp2, tmp);

		/* s1 = ftmp6 = y1 * z2**3 */
		felem_mul(tmp, y1, ftmp2);
		felem_reduce(ftmp6, tmp);
		}
	else
		{
		/* We'll assume z2 = 1 (special case z2 = 0 is handled later) */

		/* u1 = ftmp3 = x1*z2z2 */
		felem_assign(ftmp3, x1);

		/* ftmp5 = 2*z1z2 */
		felem_scalar(ftmp5, z1, 2);

		/* s1 = ftmp6 = y1 * z2**3 */
		felem_assign(ftmp6, y1);
		}

	/* u2 = x2*z1z1 */
	felem_mul(tmp, x2, ftmp);
	/* tmp[i] < 17*2^120 */

	/* h = ftmp4 = u2 - u1 */
	felem_diff_128_64(tmp, ftmp3);
	/* tmp[i] < 17*2^120 + 2^63 */
	felem_reduce(ftmp4, tmp);

	x_equal = felem_is_zero(ftmp4);

	/* z_out = ftmp5 * h */
	felem_mul(tmp, ftmp5, ftmp4);
	felem_reduce(z_out, tmp);

	/* ftmp = z1 * z1z1 */
	felem_mul(tmp, ftmp, z1);
	felem_reduce(ftmp, tmp);

	/* s2 = tmp = y2 * z1**3 */
	felem_mul(tmp, y2, ftmp);
	/* tmp[i] < 17*2^120 */

	/* r = ftmp5 = (s2 - s1)*2 */
	felem_diff_128_64(tmp, ftmp6);
	/* tmp[i] < 17*2^120 + 2^63 */
	felem_reduce(ftmp5, tmp);
	y_equal = felem_is_zero(ftmp5);
	felem_scalar64(ftmp5, 2);
	/* ftmp5[i] < 2^61 */

	/* The addition formula breaks down when both inputs are the same
	 * finite point; fall back to doubling (see the timing note above). */
	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
		{
		point_double(x3, y3, z3, x1, y1, z1);
		return;
		}

	/* I = ftmp = (2h)**2 */
	felem_assign(ftmp, ftmp4);
	felem_scalar64(ftmp, 2);
	/* ftmp[i] < 2^61 */
	felem_square(tmp, ftmp);
	/* tmp[i] < 17*2^122 */
	felem_reduce(ftmp, tmp);

	/* J = ftmp2 = h * I */
	felem_mul(tmp, ftmp4, ftmp);
	felem_reduce(ftmp2, tmp);

	/* V = ftmp4 = U1 * I */
	felem_mul(tmp, ftmp3, ftmp);
	felem_reduce(ftmp4, tmp);

	/* x_out = r**2 - J - 2V */
	felem_square(tmp, ftmp5);
	/* tmp[i] < 17*2^122 */
	felem_diff_128_64(tmp, ftmp2);
	/* tmp[i] < 17*2^122 + 2^63 */
	felem_assign(ftmp3, ftmp4);
	felem_scalar64(ftmp4, 2);
	/* ftmp4[i] < 2^61 */
	felem_diff_128_64(tmp, ftmp4);
	/* tmp[i] < 17*2^122 + 2^64 */
	felem_reduce(x_out, tmp);

	/* y_out = r(V-x_out) - 2 * s1 * J */
	felem_diff64(ftmp3, x_out);
	/* ftmp3[i] < 2^60 + 2^60
	 *          = 2^61 */
	felem_mul(tmp, ftmp5, ftmp3);
	/* tmp[i] < 17*2^122 */
	felem_mul(tmp2, ftmp6, ftmp2);
	/* tmp2[i] < 17*2^120 */
	felem_scalar128(tmp2, 2);
	/* tmp2[i] < 17*2^121 */
	felem_diff128(tmp, tmp2);
	/* tmp[i] < 2^127 - 2^69 + 17*2^122
	 *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
	 *        < 2^127 */
	felem_reduce(y_out, tmp);

	/* Handle the points at infinity without branching: if z1 == 0 the
	 * result is (x2, y2, z2); if z2 == 0 it is (x1, y1, z1). */
	copy_conditional(x_out, x2, z1_is_zero);
	copy_conditional(x_out, x1, z2_is_zero);
	copy_conditional(y_out, y2, z1_is_zero);
	copy_conditional(y_out, y1, z2_is_zero);
	copy_conditional(z_out, z2, z1_is_zero);
	copy_conditional(z_out, z1, z2_is_zero);
	felem_assign(x3, x_out);
	felem_assign(y3, y_out);
	felem_assign(z3, z_out);
	}
1207
1208 /* Base point pre computation
1209 * --------------------------
1210 *
1211 * Two different sorts of precomputed tables are used in the following code.
1212 * Each contain various points on the curve, where each point is three field
1213 * elements (x, y, z).
1214 *
1215 * For the base point table, z is usually 1 (0 for the point at infinity).
1216 * This table has 16 elements:
1217 * index | bits | point
1218 * ------+---------+------------------------------
1219 * 0 | 0 0 0 0 | 0G
1220 * 1 | 0 0 0 1 | 1G
1221 * 2 | 0 0 1 0 | 2^130G
1222 * 3 | 0 0 1 1 | (2^130 + 1)G
1223 * 4 | 0 1 0 0 | 2^260G
1224 * 5 | 0 1 0 1 | (2^260 + 1)G
1225 * 6 | 0 1 1 0 | (2^260 + 2^130)G
1226 * 7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1227 * 8 | 1 0 0 0 | 2^390G
1228 * 9 | 1 0 0 1 | (2^390 + 1)G
1229 * 10 | 1 0 1 0 | (2^390 + 2^130)G
1230 * 11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1231 * 12 | 1 1 0 0 | (2^390 + 2^260)G
1232 * 13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1233 * 14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1234 * 15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1235 *
1236 * The reason for this is so that we can clock bits into four different
1237 * locations when doing simple scalar multiplies against the base point.
1238 *
1239 * Tables for other points have table[i] = iG for i in 0 .. 16. */
1240
/* gmul is the table of precomputed base points, indexed as described in the
 * comment above: entry i holds the Jacobian coordinates (x, y, z) of the
 * combination of 2^390G, 2^260G, 2^130G and G selected by the four bits of
 * i.  z = 1 marks an affine entry and z = 0 the point at infinity; each
 * coordinate is stored as nine limbs, least significant first, in the field
 * representation used throughout this file. */
static const felem gmul[16][3] =
	{{{0, 0, 0, 0, 0, 0, 0, 0, 0},
	  {0, 0, 0, 0, 0, 0, 0, 0, 0},
	  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
	   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
	   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
	  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
	   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
	   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
	   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
	   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
	  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
	   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
	   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
	   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
	   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
	  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
	   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
	   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
	   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
	   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
	  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
	   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
	   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
	   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
	   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
	  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
	   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
	   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
	   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
	   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
	  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
	   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
	   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
	   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
	   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
	  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
	   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
	   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
	   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
	   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
	  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
	   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
	   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
	   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
	   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
	  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
	   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
	   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
	   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
	   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
	  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
	   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
	   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
	   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
	   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
	  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
	   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
	   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
	   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
	   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
	  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
	   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
	   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
	   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
	   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
	  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
	   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
	   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
	   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
	   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
	  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
	   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
	   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
	 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
	   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
	   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
	  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
	   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
	   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
	  {1, 0, 0, 0, 0, 0, 0, 0, 0}}};
1351
1352 /* select_point selects the |idx|th point from a precomputation table and
1353 * copies it to out. */
1354 static void select_point(const limb idx, unsigned int size, const felem pre_comp[/* size */][3],
1355 felem out[3])
1356 {
1357 unsigned i, j;
1358 limb *outlimbs = &out[0][0];
1359 memset(outlimbs, 0, 3 * sizeof(felem));
1360
1361 for (i = 0; i < size; i++)
1362 {
1363 const limb *inlimbs = &pre_comp[i][0][0];
1364 limb mask = i ^ idx;
1365 mask |= mask >> 4;
1366 mask |= mask >> 2;
1367 mask |= mask >> 1;
1368 mask &= 1;
1369 mask--;
1370 for (j = 0; j < NLIMBS * 3; j++)
1371 outlimbs[j] |= inlimbs[j] & mask;
1372 }
1373 }
1374
1375 /* get_bit returns the |i|th bit in |in| */
1376 static char get_bit(const felem_bytearray in, int i)
1377 {
1378 if (i < 0)
1379 return 0;
1380 return (in[i >> 3] >> (i & 7)) & 1;
1381 }
1382
/* Interleaved point multiplication using precomputed point multiples:
 * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
 * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
 * of the generator, using certain (large) precomputed multiples in g_pre_comp.
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
static void batch_mul(felem x_out, felem y_out, felem z_out,
	const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
	const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[16][3])
	{
	int i, skip;
	unsigned num, gen_mul = (g_scalar != NULL);
	felem nq[3], tmp[4];
	limb bits;
	u8 sign, digit;

	/* set nq to the point at infinity */
	memset(nq, 0, 3 * sizeof(felem));

	/* Loop over all scalars msb-to-lsb, interleaving additions
	 * of multiples of the generator (last quarter of rounds)
	 * and additions of other points multiples (every 5th round).
	 */
	skip = 1;	/* save two point operations in the first round */
	for (i = (num_points ? 520 : 130); i >= 0; --i)
		{
		/* double */
		if (!skip)
			point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

		/* add multiples of the generator: one bit is taken from each
		 * 130-bit quarter of g_scalar, matching the gmul table layout */
		if (gen_mul && (i <= 130))
			{
			bits = get_bit(g_scalar, i + 390) << 3;
			if (i < 130)
				{
				bits |= get_bit(g_scalar, i + 260) << 2;
				bits |= get_bit(g_scalar, i + 130) << 1;
				bits |= get_bit(g_scalar, i);
				}
			/* select the point to add, in constant time */
			select_point(bits, 16, g_pre_comp, tmp);
			if (!skip)
				{
				point_add(nq[0], nq[1], nq[2],
					nq[0], nq[1], nq[2],
					1 /* mixed */, tmp[0], tmp[1], tmp[2]);
				}
			else
				{
				/* nq is still infinity: just take the selected point */
				memcpy(nq, tmp, 3 * sizeof(felem));
				skip = 0;
				}
			}

		/* do other additions every 5 doublings */
		if (num_points && (i % 5 == 0))
			{
			/* loop over all scalars */
			for (num = 0; num < num_points; ++num)
				{
				/* a signed 5-bit window: 6 raw bits are recoded
				 * into a sign and a digit in [0, 16] */
				bits = get_bit(scalars[num], i + 4) << 5;
				bits |= get_bit(scalars[num], i + 3) << 4;
				bits |= get_bit(scalars[num], i + 2) << 3;
				bits |= get_bit(scalars[num], i + 1) << 2;
				bits |= get_bit(scalars[num], i) << 1;
				bits |= get_bit(scalars[num], i - 1);
				ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

				/* select the point to add or subtract, in constant time */
				select_point(digit, 17, pre_comp[num], tmp);
				felem_neg(tmp[3], tmp[1]);	/* (X, -Y, Z) is the negative point */
				copy_conditional(tmp[1], tmp[3], (-(limb) sign));

				if (!skip)
					{
					point_add(nq[0], nq[1], nq[2],
						nq[0], nq[1], nq[2],
						mixed, tmp[0], tmp[1], tmp[2]);
					}
				else
					{
					memcpy(nq, tmp, 3 * sizeof(felem));
					skip = 0;
					}
				}
			}
		}
	felem_assign(x_out, nq[0]);
	felem_assign(y_out, nq[1]);
	felem_assign(z_out, nq[2]);
	}
1474
1475
/* Precomputation for the group generator. */
typedef struct {
	/* generator multiples, laid out exactly like the gmul table above */
	felem g_pre_comp[16][3];
	/* reference count; adjusted via CRYPTO_add under CRYPTO_LOCK_EC_PRE_COMP */
	int references;
} NISTP521_PRE_COMP;
1481
/* EC_GFp_nistp521_method returns the EC_METHOD for this implementation:
 * the generic GFp routines, with point multiplication, precomputation and
 * affine-coordinate retrieval overridden by the constant-time code in this
 * file.  The initializer is positional, so the entries below must stay in
 * the order of the EC_METHOD struct definition. */
const EC_METHOD *EC_GFp_nistp521_method(void)
	{
	static const EC_METHOD ret = {
		EC_FLAGS_DEFAULT_OCT,
		NID_X9_62_prime_field,
		ec_GFp_nistp521_group_init,
		ec_GFp_simple_group_finish,
		ec_GFp_simple_group_clear_finish,
		ec_GFp_nist_group_copy,
		ec_GFp_nistp521_group_set_curve,
		ec_GFp_simple_group_get_curve,
		ec_GFp_simple_group_get_degree,
		ec_GFp_simple_group_check_discriminant,
		ec_GFp_simple_point_init,
		ec_GFp_simple_point_finish,
		ec_GFp_simple_point_clear_finish,
		ec_GFp_simple_point_copy,
		ec_GFp_simple_point_set_to_infinity,
		ec_GFp_simple_set_Jprojective_coordinates_GFp,
		ec_GFp_simple_get_Jprojective_coordinates_GFp,
		ec_GFp_simple_point_set_affine_coordinates,
		ec_GFp_nistp521_point_get_affine_coordinates,
		0 /* point_set_compressed_coordinates */,
		0 /* point2oct */,
		0 /* oct2point */,
		ec_GFp_simple_add,
		ec_GFp_simple_dbl,
		ec_GFp_simple_invert,
		ec_GFp_simple_is_at_infinity,
		ec_GFp_simple_is_on_curve,
		ec_GFp_simple_cmp,
		ec_GFp_simple_make_affine,
		ec_GFp_simple_points_make_affine,
		ec_GFp_nistp521_points_mul,
		ec_GFp_nistp521_precompute_mult,
		ec_GFp_nistp521_have_precompute_mult,
		ec_GFp_nist_field_mul,
		ec_GFp_nist_field_sqr,
		0 /* field_div */,
		0 /* field_encode */,
		0 /* field_decode */,
		0 /* field_set_to_one */ };

	return &ret;
	}
1527
1528
1529 /******************************************************************************/
1530 /* FUNCTIONS TO MANAGE PRECOMPUTATION
1531 */
1532
1533 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1534 {
1535 NISTP521_PRE_COMP *ret = NULL;
1536 ret = (NISTP521_PRE_COMP *)OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1537 if (!ret)
1538 {
1539 ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1540 return ret;
1541 }
1542 memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1543 ret->references = 1;
1544 return ret;
1545 }
1546
1547 static void *nistp521_pre_comp_dup(void *src_)
1548 {
1549 NISTP521_PRE_COMP *src = src_;
1550
1551 /* no need to actually copy, these objects never change! */
1552 CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1553
1554 return src_;
1555 }
1556
1557 static void nistp521_pre_comp_free(void *pre_)
1558 {
1559 int i;
1560 NISTP521_PRE_COMP *pre = pre_;
1561
1562 if (!pre)
1563 return;
1564
1565 i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1566 if (i > 0)
1567 return;
1568
1569 OPENSSL_free(pre);
1570 }
1571
1572 static void nistp521_pre_comp_clear_free(void *pre_)
1573 {
1574 int i;
1575 NISTP521_PRE_COMP *pre = pre_;
1576
1577 if (!pre)
1578 return;
1579
1580 i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1581 if (i > 0)
1582 return;
1583
1584 OPENSSL_cleanse(pre, sizeof(*pre));
1585 OPENSSL_free(pre);
1586 }
1587
1588 /******************************************************************************/
1589 /* OPENSSL EC_METHOD FUNCTIONS
1590 */
1591
1592 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1593 {
1594 int ret;
1595 ret = ec_GFp_simple_group_init(group);
1596 group->a_is_minus3 = 1;
1597 return ret;
1598 }
1599
1600 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1601 const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
1602 {
1603 int ret = 0;
1604 BN_CTX *new_ctx = NULL;
1605 BIGNUM *curve_p, *curve_a, *curve_b;
1606
1607 if (ctx == NULL)
1608 if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1609 BN_CTX_start(ctx);
1610 if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1611 ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1612 ((curve_b = BN_CTX_get(ctx)) == NULL)) goto err;
1613 BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1614 BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1615 BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1616 if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
1617 (BN_cmp(curve_b, b)))
1618 {
1619 ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1620 EC_R_WRONG_CURVE_PARAMETERS);
1621 goto err;
1622 }
1623 group->field_mod_func = BN_nist_mod_521;
1624 ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1625 err:
1626 BN_CTX_end(ctx);
1627 if (new_ctx != NULL)
1628 BN_CTX_free(new_ctx);
1629 return ret;
1630 }
1631
1632 /* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
1633 * (X', Y') = (X/Z^2, Y/Z^3) */
1634 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1635 const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
1636 {
1637 felem z1, z2, x_in, y_in, x_out, y_out;
1638 largefelem tmp;
1639
1640 if (EC_POINT_is_at_infinity(group, point))
1641 {
1642 ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1643 EC_R_POINT_AT_INFINITY);
1644 return 0;
1645 }
1646 if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1647 (!BN_to_felem(z1, &point->Z))) return 0;
1648 felem_inv(z2, z1);
1649 felem_square(tmp, z2); felem_reduce(z1, tmp);
1650 felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
1651 felem_contract(x_out, x_in);
1652 if (x != NULL)
1653 {
1654 if (!felem_to_BN(x, x_out))
1655 {
1656 ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
1657 return 0;
1658 }
1659 }
1660 felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
1661 felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
1662 felem_contract(y_out, y_in);
1663 if (y != NULL)
1664 {
1665 if (!felem_to_BN(y, y_out))
1666 {
1667 ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
1668 return 0;
1669 }
1670 }
1671 return 1;
1672 }
1673
/* make_points_affine converts |num| Jacobian points to affine form in place
 * by delegating to the shared nistp helper; tmp_felems must provide num+1
 * scratch field elements.  The function-pointer casts adapt this file's
 * felem operations to the helper's generic void* interface. */
static void make_points_affine(size_t num, felem points[/* num */][3], felem tmp_felems[/* num+1 */])
	{
	/* Runs in constant time, unless an input is the point at infinity
	 * (which normally shouldn't happen). */
	ec_GFp_nistp_points_make_affine_internal(
		num,
		points,
		sizeof(felem),
		tmp_felems,
		(void (*)(void *)) felem_one,
		(int (*)(const void *)) felem_is_zero_int,
		(void (*)(void *, const void *)) felem_assign,
		(void (*)(void *, const void *)) felem_square_reduce,
		(void (*)(void *, const void *, const void *)) felem_mul_reduce,
		(void (*)(void *, const void *)) felem_inv,
		(void (*)(void *, const void *)) felem_contract);
	}
1691
/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values
 * Result is stored in r (r can equal one of the inputs).
 *
 * If valid generator precomputation is available (either the built-in gmul
 * table or a NISTP521_PRE_COMP attached to the group), the generator term is
 * handled with the four-bit-interleaved table in batch_mul; otherwise the
 * generator is simply appended as an extra "random" point. */
int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
	const BIGNUM *scalar, size_t num, const EC_POINT *points[],
	const BIGNUM *scalars[], BN_CTX *ctx)
	{
	int ret = 0;
	int j;
	int mixed = 0;
	BN_CTX *new_ctx = NULL;
	BIGNUM *x, *y, *z, *tmp_scalar;
	felem_bytearray g_secret;
	felem_bytearray *secrets = NULL;
	felem (*pre_comp)[17][3] = NULL;
	felem *tmp_felems = NULL;
	felem_bytearray tmp;
	unsigned i, num_bytes;
	int have_pre_comp = 0;
	size_t num_points = num;
	felem x_in, y_in, z_in, x_out, y_out, z_out;
	NISTP521_PRE_COMP *pre = NULL;
	felem (*g_pre_comp)[3] = NULL;
	EC_POINT *generator = NULL;
	const EC_POINT *p = NULL;
	const BIGNUM *p_scalar = NULL;

	if (ctx == NULL)
		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	BN_CTX_start(ctx);
	if (((x = BN_CTX_get(ctx)) == NULL) ||
		((y = BN_CTX_get(ctx)) == NULL) ||
		((z = BN_CTX_get(ctx)) == NULL) ||
		((tmp_scalar = BN_CTX_get(ctx)) == NULL))
		goto err;

	if (scalar != NULL)
		{
		pre = EC_EX_DATA_get_data(group->extra_data,
			nistp521_pre_comp_dup, nistp521_pre_comp_free,
			nistp521_pre_comp_clear_free);
		if (pre)
			/* we have precomputation, try to use it */
			g_pre_comp = &pre->g_pre_comp[0];
		else
			/* try to use the standard precomputation */
			g_pre_comp = (felem (*)[3]) gmul;
		generator = EC_POINT_new(group);
		if (generator == NULL)
			goto err;
		/* get the generator from precomputation */
		if (!felem_to_BN(x, g_pre_comp[1][0]) ||
			!felem_to_BN(y, g_pre_comp[1][1]) ||
			!felem_to_BN(z, g_pre_comp[1][2]))
			{
			ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
			goto err;
			}
		if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
			generator, x, y, z, ctx))
			goto err;
		if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
			/* precomputation matches generator */
			have_pre_comp = 1;
		else
			/* we don't have valid precomputation:
			 * treat the generator as a random point */
			num_points++;
		}

	if (num_points > 0)
		{
		if (num_points >= 2)
			{
			/* unless we precompute multiples for just one point,
			 * converting those into affine form is time well spent */
			mixed = 1;
			}
		secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
		pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
		if (mixed)
			tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
		if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL)))
			{
			ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
			goto err;
			}

		/* we treat NULL scalars as 0, and NULL points as points at infinity,
		 * i.e., they contribute nothing to the linear combination */
		memset(secrets, 0, num_points * sizeof(felem_bytearray));
		memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
		for (i = 0; i < num_points; ++i)
			{
			if (i == num)
				/* we didn't have a valid precomputation, so we pick
				 * the generator */
				{
				p = EC_GROUP_get0_generator(group);
				p_scalar = scalar;
				}
			else
				/* the i^th point */
				{
				p = points[i];
				p_scalar = scalars[i];
				}
			if ((p_scalar != NULL) && (p != NULL))
				{
				/* reduce scalar to 0 <= scalar < 2^521 */
				if ((BN_num_bits(p_scalar) > 521) || (BN_is_negative(p_scalar)))
					{
					/* this is an unusual input, and we don't guarantee
					 * constant-timeness */
					if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx))
						{
						ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
						goto err;
						}
					num_bytes = BN_bn2bin(tmp_scalar, tmp);
					}
				else
					num_bytes = BN_bn2bin(p_scalar, tmp);
				flip_endian(secrets[i], tmp, num_bytes);
				/* precompute multiples: 1*p is copied in, then
				 * odd j = (j-1)*p + p, even j = 2 * (j/2)*p */
				if ((!BN_to_felem(x_out, &p->X)) ||
					(!BN_to_felem(y_out, &p->Y)) ||
					(!BN_to_felem(z_out, &p->Z))) goto err;
				memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
				memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
				memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
				for (j = 2; j <= 16; ++j)
					{
					if (j & 1)
						{
						point_add(
							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
							pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
							0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]);
						}
					else
						{
						point_double(
							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
							pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]);
						}
					}
				}
			}
		if (mixed)
			make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
		}

	/* the scalar for the generator */
	if ((scalar != NULL) && (have_pre_comp))
		{
		memset(g_secret, 0, sizeof(g_secret));
		/* reduce scalar to 0 <= scalar < 2^521 */
		if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar)))
			{
			/* this is an unusual input, and we don't guarantee
			 * constant-timeness */
			if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx))
				{
				ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
				goto err;
				}
			num_bytes = BN_bn2bin(tmp_scalar, tmp);
			}
		else
			num_bytes = BN_bn2bin(scalar, tmp);
		flip_endian(g_secret, tmp, num_bytes);
		/* do the multiplication with generator precomputation*/
		batch_mul(x_out, y_out, z_out,
			(const felem_bytearray (*)) secrets, num_points,
			g_secret,
			mixed, (const felem (*)[17][3]) pre_comp,
			(const felem (*)[3]) g_pre_comp);
		}
	else
		/* do the multiplication without generator precomputation */
		batch_mul(x_out, y_out, z_out,
			(const felem_bytearray (*)) secrets, num_points,
			NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL);
	/* reduce the output to its unique minimal representation */
	felem_contract(x_in, x_out);
	felem_contract(y_in, y_out);
	felem_contract(z_in, z_out);
	if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
		(!felem_to_BN(z, z_in)))
		{
		ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
		goto err;
		}
	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);

err:
	BN_CTX_end(ctx);
	if (generator != NULL)
		EC_POINT_free(generator);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	if (secrets != NULL)
		OPENSSL_free(secrets);
	if (pre_comp != NULL)
		OPENSSL_free(pre_comp);
	if (tmp_felems != NULL)
		OPENSSL_free(tmp_felems);
	return ret;
	}
1901
1902 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1903 {
1904 int ret = 0;
1905 NISTP521_PRE_COMP *pre = NULL;
1906 int i, j;
1907 BN_CTX *new_ctx = NULL;
1908 BIGNUM *x, *y;
1909 EC_POINT *generator = NULL;
1910 felem tmp_felems[16];
1911
1912 /* throw away old precomputation */
1913 EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
1914 nistp521_pre_comp_free, nistp521_pre_comp_clear_free);
1915 if (ctx == NULL)
1916 if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1917 BN_CTX_start(ctx);
1918 if (((x = BN_CTX_get(ctx)) == NULL) ||
1919 ((y = BN_CTX_get(ctx)) == NULL))
1920 goto err;
1921 /* get the generator */
1922 if (group->generator == NULL) goto err;
1923 generator = EC_POINT_new(group);
1924 if (generator == NULL)
1925 goto err;
1926 BN_bin2bn(nistp521_curve_params[3], sizeof (felem_bytearray), x);
1927 BN_bin2bn(nistp521_curve_params[4], sizeof (felem_bytearray), y);
1928 if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
1929 goto err;
1930 if ((pre = nistp521_pre_comp_new()) == NULL)
1931 goto err;
1932 /* if the generator is the standard one, use built-in precomputation */
1933 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1934 {
1935 memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1936 ret = 1;
1937 goto err;
1938 }
1939 if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
1940 (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
1941 (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
1942 goto err;
1943 /* compute 2^130*G, 2^260*G, 2^390*G */
1944 for (i = 1; i <= 4; i <<= 1)
1945 {
1946 point_double(pre->g_pre_comp[2*i][0], pre->g_pre_comp[2*i][1],
1947 pre->g_pre_comp[2*i][2], pre->g_pre_comp[i][0],
1948 pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
1949 for (j = 0; j < 129; ++j)
1950 {
1951 point_double(pre->g_pre_comp[2*i][0],
1952 pre->g_pre_comp[2*i][1],
1953 pre->g_pre_comp[2*i][2],
1954 pre->g_pre_comp[2*i][0],
1955 pre->g_pre_comp[2*i][1],
1956 pre->g_pre_comp[2*i][2]);
1957 }
1958 }
1959 /* g_pre_comp[0] is the point at infinity */
1960 memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
1961 /* the remaining multiples */
1962 /* 2^130*G + 2^260*G */
1963 point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
1964 pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
1965 pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
1966 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1967 pre->g_pre_comp[2][2]);
1968 /* 2^130*G + 2^390*G */
1969 point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
1970 pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
1971 pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
1972 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1973 pre->g_pre_comp[2][2]);
1974 /* 2^260*G + 2^390*G */
1975 point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
1976 pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
1977 pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
1978 0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
1979 pre->g_pre_comp[4][2]);
1980 /* 2^130*G + 2^260*G + 2^390*G */
1981 point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
1982 pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
1983 pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
1984 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1985 pre->g_pre_comp[2][2]);
1986 for (i = 1; i < 8; ++i)
1987 {
1988 /* odd multiples: add G */
1989 point_add(pre->g_pre_comp[2*i+1][0], pre->g_pre_comp[2*i+1][1],
1990 pre->g_pre_comp[2*i+1][2], pre->g_pre_comp[2*i][0],
1991 pre->g_pre_comp[2*i][1], pre->g_pre_comp[2*i][2],
1992 0, pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
1993 pre->g_pre_comp[1][2]);
1994 }
1995 make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
1996
1997 if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
1998 nistp521_pre_comp_free, nistp521_pre_comp_clear_free))
1999 goto err;
2000 ret = 1;
2001 pre = NULL;
2002 err:
2003 BN_CTX_end(ctx);
2004 if (generator != NULL)
2005 EC_POINT_free(generator);
2006 if (new_ctx != NULL)
2007 BN_CTX_free(new_ctx);
2008 if (pre)
2009 nistp521_pre_comp_free(pre);
2010 return ret;
2011 }
2012
2013 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2014 {
2015 if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2016 nistp521_pre_comp_free, nistp521_pre_comp_clear_free)
2017 != NULL)
2018 return 1;
2019 else
2020 return 0;
2021 }
2022
#else
/* When the nistp implementation is compiled out, keep the translation unit
 * non-empty (a strictly empty unit is ill-formed in ISO C); the self-reference
 * also stops the compiler flagging 'dummy' as unused. */
static void *dummy=&dummy;
#endif