]> git.ipfire.org Git - thirdparty/strongswan.git/blob - src/libstrongswan/plugins/curve25519/ref10/ref10.c
Update copyright headers after acquisition by secunet
[thirdparty/strongswan.git] / src / libstrongswan / plugins / curve25519 / ref10 / ref10.c
1 /*
2 * Copyright (C) 2016 Andreas Steffen
3 *
4 * Based on the public domain libsodium adaptation by Frank Denis
5 * of the SUPERCOP ref10 implementation by Daniel J. Bernstein,
6 * Niels Duif, Peter Schwabe, Tanja Lange and Bo-Yin Yang.
7 */
8
9 #include <stddef.h>
10 #include <stdint.h>
11 #include <string.h>
12
13 #include "ref10.h"
14
15 #include <utils/utils.h>
16
/**
 * Load 3 bytes in little-endian order into a 64-bit integer.
 */
static uint64_t load_3(const uint8_t *in)
{
	return ((uint64_t) in[0]) |
		   (((uint64_t) in[1]) << 8) |
		   (((uint64_t) in[2]) << 16);
}
27
/**
 * Load 4 bytes in little-endian order into a 64-bit integer.
 */
static uint64_t load_4(const uint8_t *in)
{
	return ((uint64_t) in[0]) |
		   (((uint64_t) in[1]) << 8) |
		   (((uint64_t) in[2]) << 16) |
		   (((uint64_t) in[3]) << 24);
}
39
40 /**
41 * h = 0
42 */
43 static void fe_0(fe h)
44 {
45 memset(&h[0], 0, 10 * sizeof h[0]);
46 }
47
48 /**
49 * h = 1
50 */
51 static void fe_1(fe h)
52 {
53 h[0] = 1;
54 h[1] = 0;
55 memset(&h[2], 0, 8 * sizeof h[0]);
56 }
57
58 /**
59 * h = f + g
60 * Can overlap h with f or g.
61 *
62 * Preconditions:
63 * |f| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
64 * |g| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
65 *
66 * Postconditions:
67 * |h| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25, etc.
68 */
69 static void fe_add(fe h, const fe f, const fe g)
70 {
71 int32_t f0 = f[0];
72 int32_t f1 = f[1];
73 int32_t f2 = f[2];
74 int32_t f3 = f[3];
75 int32_t f4 = f[4];
76 int32_t f5 = f[5];
77 int32_t f6 = f[6];
78 int32_t f7 = f[7];
79 int32_t f8 = f[8];
80 int32_t f9 = f[9];
81
82 int32_t g0 = g[0];
83 int32_t g1 = g[1];
84 int32_t g2 = g[2];
85 int32_t g3 = g[3];
86 int32_t g4 = g[4];
87 int32_t g5 = g[5];
88 int32_t g6 = g[6];
89 int32_t g7 = g[7];
90 int32_t g8 = g[8];
91 int32_t g9 = g[9];
92
93 int32_t h0 = f0 + g0;
94 int32_t h1 = f1 + g1;
95 int32_t h2 = f2 + g2;
96 int32_t h3 = f3 + g3;
97 int32_t h4 = f4 + g4;
98 int32_t h5 = f5 + g5;
99 int32_t h6 = f6 + g6;
100 int32_t h7 = f7 + g7;
101 int32_t h8 = f8 + g8;
102 int32_t h9 = f9 + g9;
103
104 h[0] = h0;
105 h[1] = h1;
106 h[2] = h2;
107 h[3] = h3;
108 h[4] = h4;
109 h[5] = h5;
110 h[6] = h6;
111 h[7] = h7;
112 h[8] = h8;
113 h[9] = h9;
114 }
115
116 /**
117 * Replace (f,g) with (g,g) if b == 1;
118 * replace (f,g) with (f,g) if b == 0.
119 *
120 * Preconditions: b in {0,1}.
121 */
122 static void fe_cmov(fe f, const fe g, unsigned int b)
123 {
124 int32_t f0 = f[0];
125 int32_t f1 = f[1];
126 int32_t f2 = f[2];
127 int32_t f3 = f[3];
128 int32_t f4 = f[4];
129 int32_t f5 = f[5];
130 int32_t f6 = f[6];
131 int32_t f7 = f[7];
132 int32_t f8 = f[8];
133 int32_t f9 = f[9];
134
135 int32_t g0 = g[0];
136 int32_t g1 = g[1];
137 int32_t g2 = g[2];
138 int32_t g3 = g[3];
139 int32_t g4 = g[4];
140 int32_t g5 = g[5];
141 int32_t g6 = g[6];
142 int32_t g7 = g[7];
143 int32_t g8 = g[8];
144 int32_t g9 = g[9];
145
146 int32_t x0 = f0 ^ g0;
147 int32_t x1 = f1 ^ g1;
148 int32_t x2 = f2 ^ g2;
149 int32_t x3 = f3 ^ g3;
150 int32_t x4 = f4 ^ g4;
151 int32_t x5 = f5 ^ g5;
152 int32_t x6 = f6 ^ g6;
153 int32_t x7 = f7 ^ g7;
154 int32_t x8 = f8 ^ g8;
155 int32_t x9 = f9 ^ g9;
156
157 b = (unsigned int) (- (int) b);
158
159 x0 &= b;
160 x1 &= b;
161 x2 &= b;
162 x3 &= b;
163 x4 &= b;
164 x5 &= b;
165 x6 &= b;
166 x7 &= b;
167 x8 &= b;
168 x9 &= b;
169
170 f[0] = f0 ^ x0;
171 f[1] = f1 ^ x1;
172 f[2] = f2 ^ x2;
173 f[3] = f3 ^ x3;
174 f[4] = f4 ^ x4;
175 f[5] = f5 ^ x5;
176 f[6] = f6 ^ x6;
177 f[7] = f7 ^ x7;
178 f[8] = f8 ^ x8;
179 f[9] = f9 ^ x9;
180 }
181
182 /**
183 * h = f
184 */
185 static void fe_copy(fe h, const fe f)
186 {
187 int32_t f0 = f[0];
188 int32_t f1 = f[1];
189 int32_t f2 = f[2];
190 int32_t f3 = f[3];
191 int32_t f4 = f[4];
192 int32_t f5 = f[5];
193 int32_t f6 = f[6];
194 int32_t f7 = f[7];
195 int32_t f8 = f[8];
196 int32_t f9 = f[9];
197
198 h[0] = f0;
199 h[1] = f1;
200 h[2] = f2;
201 h[3] = f3;
202 h[4] = f4;
203 h[5] = f5;
204 h[6] = f6;
205 h[7] = f7;
206 h[8] = f8;
207 h[9] = f9;
208 }
209
/**
 * Parse a 32-byte little-endian string s into a field element h.
 *
 * Ignores top bit of h (bit 255 of s is masked off, since the field is
 * modulo 2^255-19).
 *
 * The ten limbs are loaded as alternating 26/25-bit windows of the
 * 255-bit value, then one pass of carries brings every limb into its
 * nominal range.
 */
static void fe_frombytes(fe h, const uint8_t *s)
{
	/* alternating 26/25-bit limbs; the shifts realign each 3- or 4-byte
	 * window to its limb boundary */
	int64_t h0 = load_4(s);
	int64_t h1 = load_3(s + 4) << 6;
	int64_t h2 = load_3(s + 7) << 5;
	int64_t h3 = load_3(s + 10) << 3;
	int64_t h4 = load_3(s + 13) << 2;
	int64_t h5 = load_4(s + 16);
	int64_t h6 = load_3(s + 20) << 7;
	int64_t h7 = load_3(s + 23) << 5;
	int64_t h8 = load_3(s + 26) << 4;
	/* 8388607 = 2^23-1 masks off the top bit of the input */
	int64_t h9 = (load_3(s + 29) & 8388607) << 2;

	int64_t carry0, carry1, carry2, carry3, carry4;
	int64_t carry5, carry6, carry7, carry8, carry9;

	/* carry the odd limbs first; the carry out of h9 wraps to h0 as *19
	 * because 2^255 == 19 (mod 2^255-19) */
	carry9 = (h9 + (int64_t) (1L << 24)) >> 25;
	h0 += carry9 * 19;
	h9 -= carry9 * ((uint64_t) 1L << 25);

	carry1 = (h1 + (int64_t) (1L << 24)) >> 25;
	h2 += carry1;
	h1 -= carry1 * ((uint64_t) 1L << 25);

	carry3 = (h3 + (int64_t) (1L << 24)) >> 25;
	h4 += carry3;
	h3 -= carry3 * ((uint64_t) 1L << 25);

	carry5 = (h5 + (int64_t) (1L << 24)) >> 25;
	h6 += carry5;
	h5 -= carry5 * ((uint64_t) 1L << 25);

	carry7 = (h7 + (int64_t) (1L << 24)) >> 25;
	h8 += carry7;
	h7 -= carry7 * ((uint64_t) 1L << 25);

	/* then the even (26-bit) limbs */
	carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
	h1 += carry0;
	h0 -= carry0 * ((uint64_t) 1L << 26);

	carry2 = (h2 + (int64_t) (1L << 25)) >> 26;
	h3 += carry2;
	h2 -= carry2 * ((uint64_t) 1L << 26);

	carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
	h5 += carry4;
	h4 -= carry4 * ((uint64_t) 1L << 26);

	carry6 = (h6 + (int64_t) (1L << 25)) >> 26;
	h7 += carry6;
	h6 -= carry6 * ((uint64_t) 1L << 26);

	carry8 = (h8 + (int64_t) (1L << 25)) >> 26;
	h9 += carry8;
	h8 -= carry8 * ((uint64_t) 1L << 26);

	h[0] = (int32_t) h0;
	h[1] = (int32_t) h1;
	h[2] = (int32_t) h2;
	h[3] = (int32_t) h3;
	h[4] = (int32_t) h4;
	h[5] = (int32_t) h5;
	h[6] = (int32_t) h6;
	h[7] = (int32_t) h7;
	h[8] = (int32_t) h8;
	h[9] = (int32_t) h9;
}
280
/**
 * Serialize the field element h to 32 bytes s in little-endian order,
 * fully reduced to the canonical representative in [0, 2^255-20].
 *
 * Preconditions:
 * |h| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25, etc.
 *
 * Write p=2^255-19; q=floor(h/p).
 * Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
 *
 * Proof:
 * Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
 * Also have |h-2^230 h9|<2^231 so |19 2^(-255)(h-2^230 h9)|<1/4.
 *
 * Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
 * Then 0<y<1.
 *
 * Write r=h-pq.
 * Have 0<=r<=p-1=2^255-20.
 * Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
 *
 * Write x=r+19(2^-255)r+y.
 * Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
 *
 * Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
 * so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
 */
static void fe_tobytes(uint8_t *s, const fe h)
{
	int32_t h0 = h[0];
	int32_t h1 = h[1];
	int32_t h2 = h[2];
	int32_t h3 = h[3];
	int32_t h4 = h[4];
	int32_t h5 = h[5];
	int32_t h6 = h[6];
	int32_t h7 = h[7];
	int32_t h8 = h[8];
	int32_t h9 = h[9];

	int32_t carry0, carry1, carry2, carry3, carry4;
	int32_t carry5, carry6, carry7, carry8, carry9;
	int32_t q;

	/* compute q = floor(h/p) per the claim above by rippling the rounded
	 * carry through all limbs */
	q = (19 * h9 + ((uint32_t) 1L << 24)) >> 25;
	q = (h0 + q) >> 26;
	q = (h1 + q) >> 25;
	q = (h2 + q) >> 26;
	q = (h3 + q) >> 25;
	q = (h4 + q) >> 26;
	q = (h5 + q) >> 25;
	q = (h6 + q) >> 26;
	q = (h7 + q) >> 25;
	q = (h8 + q) >> 26;
	q = (h9 + q) >> 25;

	/* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
	h0 += 19 * q;
	/* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */

	carry0 = h0 >> 26;
	h1 += carry0;
	h0 -= carry0 * ((uint32_t) 1L << 26);

	carry1 = h1 >> 25;
	h2 += carry1;
	h1 -= carry1 * ((uint32_t) 1L << 25);

	carry2 = h2 >> 26;
	h3 += carry2;
	h2 -= carry2 * ((uint32_t) 1L << 26);

	carry3 = h3 >> 25;
	h4 += carry3;
	h3 -= carry3 * ((uint32_t) 1L << 25);

	carry4 = h4 >> 26;
	h5 += carry4;
	h4 -= carry4 * ((uint32_t) 1L << 26);

	carry5 = h5 >> 25;
	h6 += carry5;
	h5 -= carry5 * ((uint32_t) 1L << 25);

	carry6 = h6 >> 26;
	h7 += carry6;
	h6 -= carry6 * ((uint32_t) 1L << 26);

	carry7 = h7 >> 25;
	h8 += carry7;
	h7 -= carry7 * ((uint32_t) 1L << 25);

	carry8 = h8 >> 26;
	h9 += carry8;
	h8 -= carry8 * ((uint32_t) 1L << 26);

	carry9 = h9 >> 25;
	h9 -= carry9 * ((uint32_t) 1L << 25);
	/* h10 = carry9 */

	/**
	 * Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
	 * Have h0+...+2^230 h9 between 0 and 2^255-1;
	 * evidently 2^255 h10-2^255 q = 0.
	 * Goal: Output h0+...+2^230 h9.
	 */
	/* pack the ten 26/25-bit limbs into 32 little-endian bytes; limbs
	 * meet inside a byte at the OR'd positions */
	s[0] = h0 >> 0;
	s[1] = h0 >> 8;
	s[2] = h0 >> 16;
	s[3] = (h0 >> 24) | (h1 * ((uint32_t) 1 << 2));
	s[4] = h1 >> 6;
	s[5] = h1 >> 14;
	s[6] = (h1 >> 22) | (h2 * ((uint32_t) 1 << 3));
	s[7] = h2 >> 5;
	s[8] = h2 >> 13;
	s[9] = (h2 >> 21) | (h3 * ((uint32_t) 1 << 5));
	s[10] = h3 >> 3;
	s[11] = h3 >> 11;
	s[12] = (h3 >> 19) | (h4 * ((uint32_t) 1 << 6));
	s[13] = h4 >> 2;
	s[14] = h4 >> 10;
	s[15] = h4 >> 18;
	s[16] = h5 >> 0;
	s[17] = h5 >> 8;
	s[18] = h5 >> 16;
	s[19] = (h5 >> 24) | (h6 * ((uint32_t) 1 << 1));
	s[20] = h6 >> 7;
	s[21] = h6 >> 15;
	s[22] = (h6 >> 23) | (h7 * ((uint32_t) 1 << 3));
	s[23] = h7 >> 5;
	s[24] = h7 >> 13;
	s[25] = (h7 >> 21) | (h8 * ((uint32_t) 1 << 4));
	s[26] = h8 >> 4;
	s[27] = h8 >> 12;
	s[28] = (h8 >> 20) | (h9 * ((uint32_t) 1 << 6));
	s[29] = h9 >> 2;
	s[30] = h9 >> 10;
	s[31] = h9 >> 18;
}
417
418 /**
419 * return 1 if f is in {1,3,5,...,q-2}
420 * return 0 if f is in {0,2,4,...,q-1}
421 *
422 * Preconditions:
423 * |f| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25, etc.
424 */
425 static int fe_isnegative(const fe f)
426 {
427 uint8_t s[32];
428
429 fe_tobytes(s,f);
430
431 return s[0] & 1;
432 }
433
434 /**
435 * return 1 if f != 0
436 * return 0 if f == 0
437 *
438 * Preconditions:
439 * |f| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25,etc.
440 */
441 static uint8_t zero[32];
442
443 static int fe_isnonzero(const fe f)
444 {
445 uint8_t s[32];
446
447 fe_tobytes(s, f);
448
449 return !memeq_const(s, zero, 32);
450 }
451
/**
 * h = f * g
 * Can overlap h with f or g.
 *
 * Preconditions:
 * |f| bounded by 1.65*2^26, 1.65*2^25, 1.65*2^26, 1.65*2^25, etc.
 * |g| bounded by 1.65*2^26, 1.65*2^25, 1.65*2^26, 1.65*2^25, etc.
 *
 * Postconditions:
 * |h| bounded by 1.01*2^25, 1.01*2^24, 1.01*2^25, 1.01*2^24, etc.
 */

/**
 * Notes on implementation strategy:
 *
 * Using schoolbook multiplication.
 * Karatsuba would save a little in some cost models.
 *
 * Most multiplications by 2 and 19 are 32-bit precomputations;
 * cheaper than 64-bit postcomputations.
 * (The *19 factors fold the reduction 2^255 == 19 (mod 2^255-19) into
 * the products whose limb weights exceed 2^255; the *2 factors account
 * for the mixed 26/25-bit limb radix.)
 *
 * There is one remaining multiplication by 19 in the carry chain;
 * one *19 precomputation can be merged into this,
 * but the resulting data flow is considerably less clean.
 *
 * There are 12 carries below.
 * 10 of them are 2-way parallelizable and vectorizable.
 * Can get away with 11 carries, but then data flow is much deeper.
 *
 * With tighter constraints on inputs can squeeze carries into int32.
 */

static void fe_mul(fe h, const fe f, const fe g)
{
	int32_t f0 = f[0];
	int32_t f1 = f[1];
	int32_t f2 = f[2];
	int32_t f3 = f[3];
	int32_t f4 = f[4];
	int32_t f5 = f[5];
	int32_t f6 = f[6];
	int32_t f7 = f[7];
	int32_t f8 = f[8];
	int32_t f9 = f[9];

	int32_t g0 = g[0];
	int32_t g1 = g[1];
	int32_t g2 = g[2];
	int32_t g3 = g[3];
	int32_t g4 = g[4];
	int32_t g5 = g[5];
	int32_t g6 = g[6];
	int32_t g7 = g[7];
	int32_t g8 = g[8];
	int32_t g9 = g[9];

	int32_t g1_19 = 19 * g1; /* 1.959375*2^29 */
	int32_t g2_19 = 19 * g2; /* 1.959375*2^30; still ok */
	int32_t g3_19 = 19 * g3;
	int32_t g4_19 = 19 * g4;
	int32_t g5_19 = 19 * g5;
	int32_t g6_19 = 19 * g6;
	int32_t g7_19 = 19 * g7;
	int32_t g8_19 = 19 * g8;
	int32_t g9_19 = 19 * g9;

	int32_t f1_2 = 2 * f1;
	int32_t f3_2 = 2 * f3;
	int32_t f5_2 = 2 * f5;
	int32_t f7_2 = 2 * f7;
	int32_t f9_2 = 2 * f9;

	int64_t f0g0 = f0 * (int64_t) g0;
	int64_t f0g1 = f0 * (int64_t) g1;
	int64_t f0g2 = f0 * (int64_t) g2;
	int64_t f0g3 = f0 * (int64_t) g3;
	int64_t f0g4 = f0 * (int64_t) g4;
	int64_t f0g5 = f0 * (int64_t) g5;
	int64_t f0g6 = f0 * (int64_t) g6;
	int64_t f0g7 = f0 * (int64_t) g7;
	int64_t f0g8 = f0 * (int64_t) g8;
	int64_t f0g9 = f0 * (int64_t) g9;

	int64_t f1g0 = f1 * (int64_t) g0;
	int64_t f1g1_2 = f1_2 * (int64_t) g1;
	int64_t f1g2 = f1 * (int64_t) g2;
	int64_t f1g3_2 = f1_2 * (int64_t) g3;
	int64_t f1g4 = f1 * (int64_t) g4;
	int64_t f1g5_2 = f1_2 * (int64_t) g5;
	int64_t f1g6 = f1 * (int64_t) g6;
	int64_t f1g7_2 = f1_2 * (int64_t) g7;
	int64_t f1g8 = f1 * (int64_t) g8;
	int64_t f1g9_38 = f1_2 * (int64_t) g9_19;

	int64_t f2g0 = f2 * (int64_t) g0;
	int64_t f2g1 = f2 * (int64_t) g1;
	int64_t f2g2 = f2 * (int64_t) g2;
	int64_t f2g3 = f2 * (int64_t) g3;
	int64_t f2g4 = f2 * (int64_t) g4;
	int64_t f2g5 = f2 * (int64_t) g5;
	int64_t f2g6 = f2 * (int64_t) g6;
	int64_t f2g7 = f2 * (int64_t) g7;
	int64_t f2g8_19 = f2 * (int64_t) g8_19;
	int64_t f2g9_19 = f2 * (int64_t) g9_19;

	int64_t f3g0 = f3 * (int64_t) g0;
	int64_t f3g1_2 = f3_2 * (int64_t) g1;
	int64_t f3g2 = f3 * (int64_t) g2;
	int64_t f3g3_2 = f3_2 * (int64_t) g3;
	int64_t f3g4 = f3 * (int64_t) g4;
	int64_t f3g5_2 = f3_2 * (int64_t) g5;
	int64_t f3g6 = f3 * (int64_t) g6;
	int64_t f3g7_38 = f3_2 * (int64_t) g7_19;
	int64_t f3g8_19 = f3 * (int64_t) g8_19;
	int64_t f3g9_38 = f3_2 * (int64_t) g9_19;

	int64_t f4g0 = f4 * (int64_t) g0;
	int64_t f4g1 = f4 * (int64_t) g1;
	int64_t f4g2 = f4 * (int64_t) g2;
	int64_t f4g3 = f4 * (int64_t) g3;
	int64_t f4g4 = f4 * (int64_t) g4;
	int64_t f4g5 = f4 * (int64_t) g5;
	int64_t f4g6_19 = f4 * (int64_t) g6_19;
	int64_t f4g7_19 = f4 * (int64_t) g7_19;
	int64_t f4g8_19 = f4 * (int64_t) g8_19;
	int64_t f4g9_19 = f4 * (int64_t) g9_19;

	int64_t f5g0 = f5 * (int64_t) g0;
	int64_t f5g1_2 = f5_2 * (int64_t) g1;
	int64_t f5g2 = f5 * (int64_t) g2;
	int64_t f5g3_2 = f5_2 * (int64_t) g3;
	int64_t f5g4 = f5 * (int64_t) g4;
	int64_t f5g5_38 = f5_2 * (int64_t) g5_19;
	int64_t f5g6_19 = f5 * (int64_t) g6_19;
	int64_t f5g7_38 = f5_2 * (int64_t) g7_19;
	int64_t f5g8_19 = f5 * (int64_t) g8_19;
	int64_t f5g9_38 = f5_2 * (int64_t) g9_19;

	int64_t f6g0 = f6 * (int64_t) g0;
	int64_t f6g1 = f6 * (int64_t) g1;
	int64_t f6g2 = f6 * (int64_t) g2;
	int64_t f6g3 = f6 * (int64_t) g3;
	int64_t f6g4_19 = f6 * (int64_t) g4_19;
	int64_t f6g5_19 = f6 * (int64_t) g5_19;
	int64_t f6g6_19 = f6 * (int64_t) g6_19;
	int64_t f6g7_19 = f6 * (int64_t) g7_19;
	int64_t f6g8_19 = f6 * (int64_t) g8_19;
	int64_t f6g9_19 = f6 * (int64_t) g9_19;

	int64_t f7g0 = f7 * (int64_t) g0;
	int64_t f7g1_2 = f7_2 * (int64_t) g1;
	int64_t f7g2 = f7 * (int64_t) g2;
	int64_t f7g3_38 = f7_2 * (int64_t) g3_19;
	int64_t f7g4_19 = f7 * (int64_t) g4_19;
	int64_t f7g5_38 = f7_2 * (int64_t) g5_19;
	int64_t f7g6_19 = f7 * (int64_t) g6_19;
	int64_t f7g7_38 = f7_2 * (int64_t) g7_19;
	int64_t f7g8_19 = f7 * (int64_t) g8_19;
	int64_t f7g9_38 = f7_2 * (int64_t) g9_19;

	int64_t f8g0 = f8 * (int64_t) g0;
	int64_t f8g1 = f8 * (int64_t) g1;
	int64_t f8g2_19 = f8 * (int64_t) g2_19;
	int64_t f8g3_19 = f8 * (int64_t) g3_19;
	int64_t f8g4_19 = f8 * (int64_t) g4_19;
	int64_t f8g5_19 = f8 * (int64_t) g5_19;
	int64_t f8g6_19 = f8 * (int64_t) g6_19;
	int64_t f8g7_19 = f8 * (int64_t) g7_19;
	int64_t f8g8_19 = f8 * (int64_t) g8_19;
	int64_t f8g9_19 = f8 * (int64_t) g9_19;

	int64_t f9g0 = f9 * (int64_t) g0;
	int64_t f9g1_38 = f9_2 * (int64_t) g1_19;
	int64_t f9g2_19 = f9 * (int64_t) g2_19;
	int64_t f9g3_38 = f9_2 * (int64_t) g3_19;
	int64_t f9g4_19 = f9 * (int64_t) g4_19;
	int64_t f9g5_38 = f9_2 * (int64_t) g5_19;
	int64_t f9g6_19 = f9 * (int64_t) g6_19;
	int64_t f9g7_38 = f9_2 * (int64_t) g7_19;
	int64_t f9g8_19 = f9 * (int64_t) g8_19;
	int64_t f9g9_38 = f9_2 * (int64_t) g9_19;

	/* each output limb h[k] collects all products figj with i+j == k,
	 * plus the reduced (i+j == k+10) products */
	int64_t h0 = f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 +
				 f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38;
	int64_t h1 = f0g1 + f1g0 + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 +
				 f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19;
	int64_t h2 = f0g2 + f1g1_2 + f2g0 + f3g9_38 + f4g8_19 + f5g7_38 +
				 f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38;
	int64_t h3 = f0g3 + f1g2 + f2g1 + f3g0 + f4g9_19 + f5g8_19 +
				 f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19;
	int64_t h4 = f0g4 + f1g3_2 + f2g2 + f3g1_2 + f4g0 + f5g9_38 +
				 f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38;
	int64_t h5 = f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 +
				 f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19;
	int64_t h6 = f0g6 + f1g5_2 + f2g4 + f3g3_2 + f4g2 + f5g1_2 +
				 f6g0 + f7g9_38 + f8g8_19 + f9g7_38;
	int64_t h7 = f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 +
				 f6g1 + f7g0 + f8g9_19 + f9g8_19;
	int64_t h8 = f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 +
				 f6g2 + f7g1_2 + f8g0 + f9g9_38;
	int64_t h9 = f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 +
				 f6g3 + f7g2 + f8g1 + f9g0 ;

	int64_t carry0, carry1, carry2, carry3, carry4;
	int64_t carry5, carry6, carry7, carry8, carry9;

	/**
	 * |h0| <= (1.65*1.65*2^52*(1+19+19+19+19)+1.65*1.65*2^50*(38+38+38+38+38))
	 * i.e. |h0| <= 1.4*2^60; narrower ranges for h2, h4, h6, h8
	 * |h1| <= (1.65*1.65*2^51*(1+1+19+19+19+19+19+19+19+19))
	 * i.e. |h1| <= 1.7*2^59; narrower ranges for h3, h5, h7, h9
	 */

	carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
	h1 += carry0;
	h0 -= carry0 * ((uint64_t) 1L << 26);
	/* |h0| <= 2^25 */
	/* |h1| <= 1.71*2^59 */

	carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
	h5 += carry4;
	h4 -= carry4 * ((uint64_t) 1L << 26);
	/* |h4| <= 2^25 */
	/* |h5| <= 1.71*2^59 */

	carry1 = (h1 + (int64_t) (1L << 24)) >> 25;
	h2 += carry1;
	h1 -= carry1 * ((uint64_t) 1L << 25);
	/* |h1| <= 2^24; from now on fits into int32 */
	/* |h2| <= 1.41*2^60 */

	carry5 = (h5 + (int64_t) (1L << 24)) >> 25;
	h6 += carry5;
	h5 -= carry5 * ((uint64_t) 1L << 25);
	/* |h5| <= 2^24; from now on fits into int32 */
	/* |h6| <= 1.41*2^60 */

	carry2 = (h2 + (int64_t) (1L << 25)) >> 26;
	h3 += carry2;
	h2 -= carry2 * ((uint64_t) 1L << 26);
	/* |h2| <= 2^25; from now on fits into int32 unchanged */
	/* |h3| <= 1.71*2^59 */

	carry6 = (h6 + (int64_t) (1L << 25)) >> 26;
	h7 += carry6;
	h6 -= carry6 * ((uint64_t) 1L << 26);
	/* |h6| <= 2^25; from now on fits into int32 unchanged */
	/* |h7| <= 1.71*2^59 */

	carry3 = (h3 + (int64_t) (1L << 24)) >> 25;
	h4 += carry3;
	h3 -= carry3 * ((uint64_t) 1L << 25);
	/* |h3| <= 2^24; from now on fits into int32 unchanged */
	/* |h4| <= 1.72*2^34 */

	carry7 = (h7 + (int64_t) (1L << 24)) >> 25;
	h8 += carry7;
	h7 -= carry7 * ((uint64_t) 1L << 25);
	/* |h7| <= 2^24; from now on fits into int32 unchanged */
	/* |h8| <= 1.41*2^60 */

	carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
	h5 += carry4;
	h4 -= carry4 * ((uint64_t) 1L << 26);
	/* |h4| <= 2^25; from now on fits into int32 unchanged */
	/* |h5| <= 1.01*2^24 */

	carry8 = (h8 + (int64_t) (1L << 25)) >> 26;
	h9 += carry8;
	h8 -= carry8 * ((uint64_t) 1L << 26);
	/* |h8| <= 2^25; from now on fits into int32 unchanged */
	/* |h9| <= 1.71*2^59 */

	/* the carry out of the top limb wraps around to h0 weighted by 19
	 * (2^255 == 19 mod 2^255-19) */
	carry9 = (h9 + (int64_t) (1L << 24)) >> 25;
	h0 += carry9 * 19;
	h9 -= carry9 * ((uint64_t) 1L << 25);
	/* |h9| <= 2^24; from now on fits into int32 unchanged */
	/* |h0| <= 1.1*2^39 */

	carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
	h1 += carry0;
	h0 -= carry0 * ((uint64_t) 1L << 26);
	/* |h0| <= 2^25; from now on fits into int32 unchanged */
	/* |h1| <= 1.01*2^24 */

	h[0] = (int32_t) h0;
	h[1] = (int32_t) h1;
	h[2] = (int32_t) h2;
	h[3] = (int32_t) h3;
	h[4] = (int32_t) h4;
	h[5] = (int32_t) h5;
	h[6] = (int32_t) h6;
	h[7] = (int32_t) h7;
	h[8] = (int32_t) h8;
	h[9] = (int32_t) h9;
}
748
749 /**
750 * h = -f
751 *
752 * Preconditions:
753 * |f| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
754 *
755 * Postconditions:
756 * |h| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
757 */
758 static void fe_neg(fe h,const fe f)
759 {
760 int32_t f0 = f[0];
761 int32_t f1 = f[1];
762 int32_t f2 = f[2];
763 int32_t f3 = f[3];
764 int32_t f4 = f[4];
765 int32_t f5 = f[5];
766 int32_t f6 = f[6];
767 int32_t f7 = f[7];
768 int32_t f8 = f[8];
769 int32_t f9 = f[9];
770
771 int32_t h0 = -f0;
772 int32_t h1 = -f1;
773 int32_t h2 = -f2;
774 int32_t h3 = -f3;
775 int32_t h4 = -f4;
776 int32_t h5 = -f5;
777 int32_t h6 = -f6;
778 int32_t h7 = -f7;
779 int32_t h8 = -f8;
780 int32_t h9 = -f9;
781
782 h[0] = h0;
783 h[1] = h1;
784 h[2] = h2;
785 h[3] = h3;
786 h[4] = h4;
787 h[5] = h5;
788 h[6] = h6;
789 h[7] = h7;
790 h[8] = h8;
791 h[9] = h9;
792 }
793
/**
 * h = f * f
 * Can overlap h with f.
 *
 * Preconditions:
 * |f| bounded by 1.65*2^26, 1.65*2^25, 1.65*2^26, 1.65*2^25, etc.
 *
 * Postconditions:
 * |h| bounded by 1.01*2^25, 1.01*2^24, 1.01*2^25, 1.01*2^24, etc.
 *
 * See fe_mul above for discussion of implementation strategy; squaring
 * saves the symmetric cross products (fi*fj == fj*fi), so doubled
 * precomputations replace the duplicated terms.
 */
static void fe_sq(fe h, const fe f)
{
	int32_t f0 = f[0];
	int32_t f1 = f[1];
	int32_t f2 = f[2];
	int32_t f3 = f[3];
	int32_t f4 = f[4];
	int32_t f5 = f[5];
	int32_t f6 = f[6];
	int32_t f7 = f[7];
	int32_t f8 = f[8];
	int32_t f9 = f[9];

	int32_t f0_2 = 2 * f0;
	int32_t f1_2 = 2 * f1;
	int32_t f2_2 = 2 * f2;
	int32_t f3_2 = 2 * f3;
	int32_t f4_2 = 2 * f4;
	int32_t f5_2 = 2 * f5;
	int32_t f6_2 = 2 * f6;
	int32_t f7_2 = 2 * f7;

	int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
	int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
	int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
	int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
	int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */

	int64_t f0f0 = f0 * (int64_t) f0;
	int64_t f0f1_2 = f0_2 * (int64_t) f1;
	int64_t f0f2_2 = f0_2 * (int64_t) f2;
	int64_t f0f3_2 = f0_2 * (int64_t) f3;
	int64_t f0f4_2 = f0_2 * (int64_t) f4;
	int64_t f0f5_2 = f0_2 * (int64_t) f5;
	int64_t f0f6_2 = f0_2 * (int64_t) f6;
	int64_t f0f7_2 = f0_2 * (int64_t) f7;
	int64_t f0f8_2 = f0_2 * (int64_t) f8;
	int64_t f0f9_2 = f0_2 * (int64_t) f9;

	int64_t f1f1_2 = f1_2 * (int64_t) f1;
	int64_t f1f2_2 = f1_2 * (int64_t) f2;
	int64_t f1f3_4 = f1_2 * (int64_t) f3_2;
	int64_t f1f4_2 = f1_2 * (int64_t) f4;
	int64_t f1f5_4 = f1_2 * (int64_t) f5_2;
	int64_t f1f6_2 = f1_2 * (int64_t) f6;
	int64_t f1f7_4 = f1_2 * (int64_t) f7_2;
	int64_t f1f8_2 = f1_2 * (int64_t) f8;
	int64_t f1f9_76 = f1_2 * (int64_t) f9_38;

	int64_t f2f2 = f2 * (int64_t) f2;
	int64_t f2f3_2 = f2_2 * (int64_t) f3;
	int64_t f2f4_2 = f2_2 * (int64_t) f4;
	int64_t f2f5_2 = f2_2 * (int64_t) f5;
	int64_t f2f6_2 = f2_2 * (int64_t) f6;
	int64_t f2f7_2 = f2_2 * (int64_t) f7;
	int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
	int64_t f2f9_38 = f2 * (int64_t) f9_38;

	int64_t f3f3_2 = f3_2 * (int64_t) f3;
	int64_t f3f4_2 = f3_2 * (int64_t) f4;
	int64_t f3f5_4 = f3_2 * (int64_t) f5_2;
	int64_t f3f6_2 = f3_2 * (int64_t) f6;
	int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
	int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
	int64_t f3f9_76 = f3_2 * (int64_t) f9_38;

	int64_t f4f4 = f4 * (int64_t) f4;
	int64_t f4f5_2 = f4_2 * (int64_t) f5;
	int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
	int64_t f4f7_38 = f4 * (int64_t) f7_38;
	int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
	int64_t f4f9_38 = f4 * (int64_t) f9_38;

	int64_t f5f5_38 = f5 * (int64_t) f5_38;
	int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
	int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
	int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
	int64_t f5f9_76 = f5_2 * (int64_t) f9_38;

	int64_t f6f6_19 = f6 * (int64_t) f6_19;
	int64_t f6f7_38 = f6 * (int64_t) f7_38;
	int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
	int64_t f6f9_38 = f6 * (int64_t) f9_38;

	int64_t f7f7_38 = f7 * (int64_t) f7_38;
	int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
	int64_t f7f9_76 = f7_2 * (int64_t) f9_38;

	int64_t f8f8_19 = f8 * (int64_t) f8_19;
	int64_t f8f9_38 = f8 * (int64_t) f9_38;

	int64_t f9f9_38 = f9 * (int64_t) f9_38;

	int64_t h0 = f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
	int64_t h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
	int64_t h2 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
	int64_t h3 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38;
	int64_t h4 = f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38;
	int64_t h5 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38;
	int64_t h6 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19;
	int64_t h7 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38;
	int64_t h8 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38;
	int64_t h9 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2;

	int64_t carry0, carry1, carry2, carry3, carry4;
	int64_t carry5, carry6, carry7, carry8, carry9;

	/* same carry schedule as fe_mul */
	carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
	h1 += carry0;
	h0 -= carry0 * ((uint64_t) 1L << 26);

	carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
	h5 += carry4;
	h4 -= carry4 * ((uint64_t) 1L << 26);

	carry1 = (h1 + (int64_t) (1L << 24)) >> 25;
	h2 += carry1;
	h1 -= carry1 * ((uint64_t) 1L << 25);

	carry5 = (h5 + (int64_t) (1L << 24)) >> 25;
	h6 += carry5;
	h5 -= carry5 * ((uint64_t) 1L << 25);

	carry2 = (h2 + (int64_t) (1L << 25)) >> 26;
	h3 += carry2;
	h2 -= carry2 * ((uint64_t) 1L << 26);

	carry6 = (h6 + (int64_t) (1L << 25)) >> 26;
	h7 += carry6;
	h6 -= carry6 * ((uint64_t) 1L << 26);

	carry3 = (h3 + (int64_t) (1L << 24)) >> 25;
	h4 += carry3;
	h3 -= carry3 * ((uint64_t) 1L << 25);

	carry7 = (h7 + (int64_t) (1L << 24)) >> 25;
	h8 += carry7;
	h7 -= carry7 * ((uint64_t) 1L << 25);

	carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
	h5 += carry4;
	h4 -= carry4 * ((uint64_t) 1L << 26);

	carry8 = (h8 + (int64_t) (1L << 25)) >> 26;
	h9 += carry8;
	h8 -= carry8 * ((uint64_t) 1L << 26);

	carry9 = (h9 + (int64_t) (1L << 24)) >> 25;
	h0 += carry9 * 19;
	h9 -= carry9 * ((uint64_t) 1L << 25);

	carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
	h1 += carry0;
	h0 -= carry0 * ((uint64_t) 1L << 26);

	h[0] = (int32_t) h0;
	h[1] = (int32_t) h1;
	h[2] = (int32_t) h2;
	h[3] = (int32_t) h3;
	h[4] = (int32_t) h4;
	h[5] = (int32_t) h5;
	h[6] = (int32_t) h6;
	h[7] = (int32_t) h7;
	h[8] = (int32_t) h8;
	h[9] = (int32_t) h9;
}
972
/**
 * h = 2 * f * f
 * Can overlap h with f.
 *
 * Preconditions:
 * |f| bounded by 1.65*2^26, 1.65*2^25, 1.65*2^26, 1.65*2^25, etc.
 *
 * Postconditions:
 * |h| bounded by 1.01*2^25, 1.01*2^24, 1.01*2^25, 1.01*2^24, etc.
 *
 * Identical to fe_sq except that every limb is doubled before the
 * final carry chain; see fe_mul above for discussion of implementation
 * strategy.
 */
static void fe_sq2(fe h, const fe f)
{
	int32_t f0 = f[0];
	int32_t f1 = f[1];
	int32_t f2 = f[2];
	int32_t f3 = f[3];
	int32_t f4 = f[4];
	int32_t f5 = f[5];
	int32_t f6 = f[6];
	int32_t f7 = f[7];
	int32_t f8 = f[8];
	int32_t f9 = f[9];

	int32_t f0_2 = 2 * f0;
	int32_t f1_2 = 2 * f1;
	int32_t f2_2 = 2 * f2;
	int32_t f3_2 = 2 * f3;
	int32_t f4_2 = 2 * f4;
	int32_t f5_2 = 2 * f5;
	int32_t f6_2 = 2 * f6;
	int32_t f7_2 = 2 * f7;

	int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
	int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
	int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
	int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
	int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */

	int64_t f0f0 = f0 * (int64_t) f0;
	int64_t f0f1_2 = f0_2 * (int64_t) f1;
	int64_t f0f2_2 = f0_2 * (int64_t) f2;
	int64_t f0f3_2 = f0_2 * (int64_t) f3;
	int64_t f0f4_2 = f0_2 * (int64_t) f4;
	int64_t f0f5_2 = f0_2 * (int64_t) f5;
	int64_t f0f6_2 = f0_2 * (int64_t) f6;
	int64_t f0f7_2 = f0_2 * (int64_t) f7;
	int64_t f0f8_2 = f0_2 * (int64_t) f8;
	int64_t f0f9_2 = f0_2 * (int64_t) f9;

	int64_t f1f1_2 = f1_2 * (int64_t) f1;
	int64_t f1f2_2 = f1_2 * (int64_t) f2;
	int64_t f1f3_4 = f1_2 * (int64_t) f3_2;
	int64_t f1f4_2 = f1_2 * (int64_t) f4;
	int64_t f1f5_4 = f1_2 * (int64_t) f5_2;
	int64_t f1f6_2 = f1_2 * (int64_t) f6;
	int64_t f1f7_4 = f1_2 * (int64_t) f7_2;
	int64_t f1f8_2 = f1_2 * (int64_t) f8;
	int64_t f1f9_76 = f1_2 * (int64_t) f9_38;

	int64_t f2f2 = f2 * (int64_t) f2;
	int64_t f2f3_2 = f2_2 * (int64_t) f3;
	int64_t f2f4_2 = f2_2 * (int64_t) f4;
	int64_t f2f5_2 = f2_2 * (int64_t) f5;
	int64_t f2f6_2 = f2_2 * (int64_t) f6;
	int64_t f2f7_2 = f2_2 * (int64_t) f7;
	int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
	int64_t f2f9_38 = f2 * (int64_t) f9_38;

	int64_t f3f3_2 = f3_2 * (int64_t) f3;
	int64_t f3f4_2 = f3_2 * (int64_t) f4;
	int64_t f3f5_4 = f3_2 * (int64_t) f5_2;
	int64_t f3f6_2 = f3_2 * (int64_t) f6;
	int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
	int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
	int64_t f3f9_76 = f3_2 * (int64_t) f9_38;

	int64_t f4f4 = f4 * (int64_t) f4;
	int64_t f4f5_2 = f4_2 * (int64_t) f5;
	int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
	int64_t f4f7_38 = f4 * (int64_t) f7_38;
	int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
	int64_t f4f9_38 = f4 * (int64_t) f9_38;

	int64_t f5f5_38 = f5 * (int64_t) f5_38;
	int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
	int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
	int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
	int64_t f5f9_76 = f5_2 * (int64_t) f9_38;

	int64_t f6f6_19 = f6 * (int64_t) f6_19;
	int64_t f6f7_38 = f6 * (int64_t) f7_38;
	int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
	int64_t f6f9_38 = f6 * (int64_t) f9_38;

	int64_t f7f7_38 = f7 * (int64_t) f7_38;
	int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
	int64_t f7f9_76 = f7_2 * (int64_t) f9_38;

	int64_t f8f8_19 = f8 * (int64_t) f8_19;
	int64_t f8f9_38 = f8 * (int64_t) f9_38;

	int64_t f9f9_38 = f9 * (int64_t) f9_38;

	int64_t h0 = f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
	int64_t h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
	int64_t h2 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
	int64_t h3 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38;
	int64_t h4 = f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38;
	int64_t h5 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38;
	int64_t h6 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19;
	int64_t h7 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38;
	int64_t h8 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38;
	int64_t h9 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2;

	int64_t carry0, carry1, carry2, carry3, carry4;
	int64_t carry5, carry6, carry7, carry8, carry9;

	/* double before carrying: h = 2*f*f */
	h0 += h0;
	h1 += h1;
	h2 += h2;
	h3 += h3;
	h4 += h4;
	h5 += h5;
	h6 += h6;
	h7 += h7;
	h8 += h8;
	h9 += h9;

	/* same carry schedule as fe_mul */
	carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
	h1 += carry0;
	h0 -= carry0 * ((uint64_t) 1L << 26);

	carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
	h5 += carry4;
	h4 -= carry4 * ((uint64_t) 1L << 26);

	carry1 = (h1 + (int64_t) (1L << 24)) >> 25;
	h2 += carry1;
	h1 -= carry1 * ((uint64_t) 1L << 25);

	carry5 = (h5 + (int64_t) (1L << 24)) >> 25;
	h6 += carry5;
	h5 -= carry5 * ((uint64_t) 1L << 25);

	carry2 = (h2 + (int64_t) (1L << 25)) >> 26;
	h3 += carry2;
	h2 -= carry2 * ((uint64_t) 1L << 26);

	carry6 = (h6 + (int64_t) (1L << 25)) >> 26;
	h7 += carry6;
	h6 -= carry6 * ((uint64_t) 1L << 26);

	carry3 = (h3 + (int64_t) (1L << 24)) >> 25;
	h4 += carry3;
	h3 -= carry3 * ((uint64_t) 1L << 25);

	carry7 = (h7 + (int64_t) (1L << 24)) >> 25;
	h8 += carry7;
	h7 -= carry7 * ((uint64_t) 1L << 25);

	carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
	h5 += carry4;
	h4 -= carry4 * ((uint64_t) 1L << 26);

	carry8 = (h8 + (int64_t) (1L << 25)) >> 26;
	h9 += carry8;
	h8 -= carry8 * ((uint64_t) 1L << 26);

	carry9 = (h9 + (int64_t) (1L << 24)) >> 25;
	h0 += carry9 * 19;
	h9 -= carry9 * ((uint64_t) 1L << 25);

	carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
	h1 += carry0;
	h0 -= carry0 * ((uint64_t) 1L << 26);

	h[0] = (int32_t) h0;
	h[1] = (int32_t) h1;
	h[2] = (int32_t) h2;
	h[3] = (int32_t) h3;
	h[4] = (int32_t) h4;
	h[5] = (int32_t) h5;
	h[6] = (int32_t) h6;
	h[7] = (int32_t) h7;
	h[8] = (int32_t) h8;
	h[9] = (int32_t) h9;
}
1162
/**
 * out = z^-1 (mod p = 2^255-19), computed as z^(p-2) = z^(2^255-21)
 * using a fixed addition chain of squarings and multiplications, so the
 * running time does not depend on the value of z.
 */
static void fe_invert(fe out, const fe z)
{
	fe t0, t1, t2, t3;
	int i;

	fe_sq(t0, z);           /* t0 = z^2 */
	fe_sq(t1, t0);
	fe_sq(t1, t1);          /* t1 = z^8 */
	fe_mul(t1, z, t1);      /* t1 = z^9 */
	fe_mul(t0, t0, t1);     /* t0 = z^11 */
	fe_sq(t2, t0);          /* t2 = z^22 */
	fe_mul(t1, t1, t2);     /* t1 = z^31 = z^(2^5-1) */
	fe_sq(t2, t1);

	for (i = 1; i < 5; ++i)     /* t2 = z^(2^10-2^5) */
	{
		fe_sq(t2, t2);
	}

	fe_mul(t1, t2, t1);     /* t1 = z^(2^10-1) */
	fe_sq(t2, t1);

	for (i = 1; i < 10; ++i)    /* t2 = z^(2^20-2^10) */
	{
		fe_sq(t2, t2);
	}

	fe_mul(t2, t2, t1);     /* t2 = z^(2^20-1) */
	fe_sq(t3, t2);

	for (i = 1; i < 20; ++i)    /* t3 = z^(2^40-2^20) */
	{
		fe_sq(t3, t3);
	}

	fe_mul(t2, t3, t2);     /* t2 = z^(2^40-1) */
	fe_sq(t2, t2);

	for (i = 1; i < 10; ++i)    /* t2 = z^(2^50-2^10) */
	{
		fe_sq(t2, t2);
	}

	fe_mul(t1, t2, t1);     /* t1 = z^(2^50-1) */
	fe_sq(t2, t1);

	for (i = 1; i < 50; ++i)    /* t2 = z^(2^100-2^50) */
	{
		fe_sq(t2, t2);
	}

	fe_mul(t2, t2, t1);     /* t2 = z^(2^100-1) */
	fe_sq(t3, t2);

	for (i = 1; i < 100; ++i)   /* t3 = z^(2^200-2^100) */
	{
		fe_sq(t3, t3);
	}

	fe_mul(t2, t3, t2);     /* t2 = z^(2^200-1) */
	fe_sq(t2, t2);

	for (i = 1; i < 50; ++i)    /* t2 = z^(2^250-2^50) */
	{
		fe_sq(t2, t2);
	}

	fe_mul(t1, t2, t1);     /* t1 = z^(2^250-1) */
	fe_sq(t1, t1);

	for (i = 1; i < 5; ++i)     /* t1 = z^(2^255-2^5) */
	{
		fe_sq(t1, t1);
	}

	fe_mul(out, t1, t0);    /* out = z^(2^255-32+11) = z^(2^255-21) */
}
1240
/**
 * out = z^((p-5)/8) = z^(2^252-3) (mod p = 2^255-19), via a fixed
 * addition chain.  Used to compute square root candidates during point
 * decompression (ge_frombytes_negate_vartime).
 */
static void fe_pow22523(fe out, const fe z)
{
	fe t0, t1, t2;
	int i;

	fe_sq(t0, z);           /* t0 = z^2 */
	fe_sq(t1, t0);
	fe_sq(t1, t1);          /* t1 = z^8 */
	fe_mul(t1, z, t1);      /* t1 = z^9 */
	fe_mul(t0, t0, t1);     /* t0 = z^11 */
	fe_sq(t0, t0);          /* t0 = z^22 */
	fe_mul(t0, t1, t0);     /* t0 = z^31 = z^(2^5-1) */
	fe_sq(t1, t0);

	for (i = 1; i < 5; ++i)     /* t1 = z^(2^10-2^5) */
	{
		fe_sq(t1, t1);
	}

	fe_mul(t0, t1, t0);     /* t0 = z^(2^10-1) */
	fe_sq(t1, t0);

	for (i = 1; i < 10; ++i)    /* t1 = z^(2^20-2^10) */
	{
		fe_sq(t1, t1);
	}

	fe_mul(t1, t1, t0);     /* t1 = z^(2^20-1) */
	fe_sq(t2, t1);

	for (i = 1; i < 20; ++i)    /* t2 = z^(2^40-2^20) */
	{
		fe_sq(t2, t2);
	}

	fe_mul(t1, t2, t1);     /* t1 = z^(2^40-1) */
	fe_sq(t1, t1);

	for (i = 1; i < 10; ++i)    /* t1 = z^(2^50-2^10) */
	{
		fe_sq(t1, t1);
	}

	fe_mul(t0, t1, t0);     /* t0 = z^(2^50-1) */
	fe_sq(t1, t0);

	for (i = 1; i < 50; ++i)    /* t1 = z^(2^100-2^50) */
	{
		fe_sq(t1, t1);
	}

	fe_mul(t1, t1, t0);     /* t1 = z^(2^100-1) */
	fe_sq(t2, t1);
	for (i = 1; i < 100; ++i)   /* t2 = z^(2^200-2^100) */
	{
		fe_sq(t2, t2);
	}

	fe_mul(t1, t2, t1);     /* t1 = z^(2^200-1) */
	fe_sq(t1, t1);

	for (i = 1; i < 50; ++i)    /* t1 = z^(2^250-2^50) */
	{
		fe_sq(t1, t1);
	}

	fe_mul(t0, t1, t0);     /* t0 = z^(2^250-1) */
	fe_sq(t0, t0);
	fe_sq(t0, t0);          /* t0 = z^(2^252-4) */
	fe_mul(out, t0, z);     /* out = z^(2^252-3) */
}
1312
1313 /**
1314 * h = f - g
1315 * Can overlap h with f or g.
1316 *
1317 * Preconditions:
1318 * |f| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
1319 * |g| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
1320 *
1321 * Postconditions:
1322 * |h| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25, etc.
1323 */
1324 static void fe_sub(fe h, const fe f, const fe g)
1325 {
1326 int32_t f0 = f[0];
1327 int32_t f1 = f[1];
1328 int32_t f2 = f[2];
1329 int32_t f3 = f[3];
1330 int32_t f4 = f[4];
1331 int32_t f5 = f[5];
1332 int32_t f6 = f[6];
1333 int32_t f7 = f[7];
1334 int32_t f8 = f[8];
1335 int32_t f9 = f[9];
1336
1337 int32_t g0 = g[0];
1338 int32_t g1 = g[1];
1339 int32_t g2 = g[2];
1340 int32_t g3 = g[3];
1341 int32_t g4 = g[4];
1342 int32_t g5 = g[5];
1343 int32_t g6 = g[6];
1344 int32_t g7 = g[7];
1345 int32_t g8 = g[8];
1346 int32_t g9 = g[9];
1347
1348 int32_t h0 = f0 - g0;
1349 int32_t h1 = f1 - g1;
1350 int32_t h2 = f2 - g2;
1351 int32_t h3 = f3 - g3;
1352 int32_t h4 = f4 - g4;
1353 int32_t h5 = f5 - g5;
1354 int32_t h6 = f6 - g6;
1355 int32_t h7 = f7 - g7;
1356 int32_t h8 = f8 - g8;
1357 int32_t h9 = f9 - g9;
1358
1359 h[0] = h0;
1360 h[1] = h1;
1361 h[2] = h2;
1362 h[3] = h3;
1363 h[4] = h4;
1364 h[5] = h5;
1365 h[6] = h6;
1366 h[7] = h7;
1367 h[8] = h8;
1368 h[9] = h9;
1369 }
1370
/**
 * r = p + q, with q in cached form (Y+X, Y-X, Z, 2d*T).
 * Extended twisted Edwards addition; the result is in the intermediate
 * p1p1 representation.
 */
static void ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q)
{
	fe t0;

	fe_add(r->X, p->Y, p->X);       /* Y1+X1 */
	fe_sub(r->Y, p->Y, p->X);       /* Y1-X1 */
	fe_mul(r->Z, r->X, q->YplusX);  /* A = (Y1+X1)(Y2+X2) */
	fe_mul(r->Y, r->Y, q->YminusX); /* B = (Y1-X1)(Y2-X2) */
	fe_mul(r->T, q->T2d, p->T);     /* C = 2d*T1*T2 */
	fe_mul(r->X, p->Z, q->Z);       /* Z1*Z2 */
	fe_add(t0, r->X, r->X);         /* D = 2*Z1*Z2 */
	fe_sub(r->X, r->Z, r->Y);       /* X3 = A-B */
	fe_add(r->Y, r->Z, r->Y);       /* Y3 = A+B */
	fe_add(r->Z, t0, r->T);         /* Z3 = D+C */
	fe_sub(r->T, t0, r->T);         /* T3 = D-C */
}
1390
/**
 * Recode the 256-bit little-endian scalar a into 256 signed digits r[i],
 * each either zero or an odd value in [-15, 15], such that
 * sum(r[i] * 2^i) == a.  Used for the sliding-window scalar
 * multiplication in ge_double_scalarmult_vartime().  Variable time.
 */
static void slide(int8_t *r, const uint8_t *a)
{
	int pos, off, j;

	/* expand a into its individual bits */
	for (pos = 0; pos < 256; ++pos)
	{
		r[pos] = 1 & (a[pos >> 3] >> (pos & 7));
	}

	/* greedily merge up to 6 following bits into each nonzero digit */
	for (pos = 0; pos < 256; ++pos)
	{
		if (!r[pos])
		{
			continue;
		}
		for (off = 1; off <= 6 && pos + off < 256; ++off)
		{
			if (!r[pos + off])
			{
				continue;
			}
			if (r[pos] + (r[pos + off] << off) <= 15)
			{
				/* absorb the higher bit into this digit */
				r[pos] += r[pos + off] << off;
				r[pos + off] = 0;
			}
			else if (r[pos] - (r[pos + off] << off) >= -15)
			{
				/* subtract instead and propagate a carry upwards */
				r[pos] -= r[pos + off] << off;

				for (j = pos + off; j < 256; ++j)
				{
					if (!r[j])
					{
						r[j] = 1;
						break;
					}
					r[j] = 0;
				}
			}
			else
			{
				break;
			}
		}
	}
}
1435
/* Bi[i] = (2*i+1)*B: precomputed odd multiples of the base point B,
 * indexed by digit/2 in ge_double_scalarmult_vartime() */
static const ge_precomp Bi[8] = {
#include "base2.h"
};
1439
/* the edwards25519 curve constant d
 * = 37095705934669439343138083508754565189542113879843219016388785533085940283555 */
static const fe d = {
	-10913610, 13857413, -15372611, 6949391, 114729,
	-8787816, -6275908, -3247719, -18696448, -12055116
};
1445
/* sqrt(-1) mod p, used to fix up the square-root candidate during point
 * decompression in ge_frombytes_negate_vartime() */
static const fe sqrtm1 = {
	-32595792, -7943725, 9377950, 3500415, 12389472,
	-272473, -25146209, -2005654, 326686, 11406482
};
1451
/**
 * Decode the 32-byte compressed point s (y coordinate with the sign of x
 * in the top bit) into extended coordinates, with the x coordinate
 * negated (h = -P).  Variable time, so only use on public data.
 *
 * @return 0 on success, -1 if s is not the encoding of a curve point
 */
int ge_frombytes_negate_vartime(ge_p3 *h, const uint8_t *s)
{
	fe u, v, v3, vxx, check;

	fe_frombytes(h->Y,s);
	fe_1(h->Z);
	fe_sq(u,h->Y);
	fe_mul(v,u,d);
	fe_sub(u,u,h->Z);	/* u = y^2-1 */
	fe_add(v,v,h->Z);	/* v = dy^2+1 */

	/* recover x from x^2 = u/v via the candidate root uv^3(uv^7)^((q-5)/8) */
	fe_sq(v3,v);
	fe_mul(v3,v3,v);	/* v3 = v^3 */
	fe_sq(h->X,v3);
	fe_mul(h->X,h->X,v);
	fe_mul(h->X,h->X,u);	/* x = uv^7 */

	fe_pow22523(h->X,h->X);	/* x = (uv^7)^((q-5)/8) */
	fe_mul(h->X,h->X,v3);
	fe_mul(h->X,h->X,u);	/* x = uv^3(uv^7)^((q-5)/8) */

	/* verify v*x^2 == +/-u; if off by a factor sqrt(-1), correct it */
	fe_sq(vxx,h->X);
	fe_mul(vxx,vxx,v);
	fe_sub(check,vxx,u);	/* vx^2-u */

	if (fe_isnonzero(check))
	{
		fe_add(check,vxx,u);	/* vx^2+u */

		if (fe_isnonzero(check))
		{
			/* u/v is not a square: s is not a valid point encoding */
			return -1;
		}
		fe_mul(h->X,h->X,sqrtm1);
	}

	/* pick the root whose sign DIFFERS from the encoded sign bit, which
	 * negates the decoded point */
	if (fe_isnegative(h->X) == (s[31] >> 7))
	{
		fe_neg(h->X,h->X);
	}
	fe_mul(h->T,h->X,h->Y);

	return 0;
}
1496
/**
 * r = p + q, with q in precomputed affine form (y+x, y-x, 2d*x*y).
 * Mixed addition: cheaper than ge_add() because q has Z == 1.
 */
static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q)
{
	fe t0;

	fe_add(r->X, p->Y, p->X);       /* Y1+X1 */
	fe_sub(r->Y, p->Y, p->X);       /* Y1-X1 */
	fe_mul(r->Z, r->X, q->yplusx);  /* A = (Y1+X1)(y2+x2) */
	fe_mul(r->Y, r->Y, q->yminusx); /* B = (Y1-X1)(y2-x2) */
	fe_mul(r->T, q->xy2d, p->T);    /* C = 2d*x2*y2*T1 */
	fe_add(t0, p->Z, p->Z);         /* D = 2*Z1 */
	fe_sub(r->X, r->Z, r->Y);       /* X3 = A-B */
	fe_add(r->Y, r->Z, r->Y);       /* Y3 = A+B */
	fe_add(r->Z, t0, r->T);         /* Z3 = D+C */
	fe_sub(r->T, t0, r->T);         /* T3 = D-C */
}
1515
/**
 * r = p - q, with q in precomputed affine form (y+x, y-x, 2d*x*y).
 * Same as ge_madd() with the roles of y+x/y-x swapped and C negated.
 */
static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q)
{
	fe t0;

	fe_add(r->X, p->Y, p->X);       /* Y1+X1 */
	fe_sub(r->Y, p->Y, p->X);       /* Y1-X1 */
	fe_mul(r->Z, r->X, q->yminusx); /* A = (Y1+X1)(y2-x2) */
	fe_mul(r->Y, r->Y, q->yplusx);  /* B = (Y1-X1)(y2+x2) */
	fe_mul(r->T, q->xy2d, p->T);    /* C = 2d*x2*y2*T1 */
	fe_add(t0, p->Z, p->Z);         /* D = 2*Z1 */
	fe_sub(r->X, r->Z, r->Y);       /* X3 = A-B */
	fe_add(r->Y, r->Z, r->Y);       /* Y3 = A+B */
	fe_sub(r->Z, t0, r->T);         /* Z3 = D-C */
	fe_add(r->T, t0, r->T);         /* T3 = D+C */
}
1534
1535 /**
1536 * r = p
1537 */
1538 static void ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p)
1539 {
1540 fe_mul(r->X,p->X,p->T);
1541 fe_mul(r->Y,p->Y,p->Z);
1542 fe_mul(r->Z,p->Z,p->T);
1543 }
1544
1545 /**
1546 * r = p
1547 */
1548 static void ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p)
1549 {
1550 fe_mul(r->X,p->X,p->T);
1551 fe_mul(r->Y,p->Y,p->Z);
1552 fe_mul(r->Z,p->Z,p->T);
1553 fe_mul(r->T,p->X,p->Y);
1554 }
1555
1556 static void ge_p2_0(ge_p2 *h)
1557 {
1558 fe_0(h->X);
1559 fe_1(h->Y);
1560 fe_1(h->Z);
1561 }
1562
/**
 * r = 2 * p (projective doubling, result in p1p1 form)
 */
static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p)
{
	fe t0;

	fe_sq(r->X, p->X);          /* A = X1^2 */
	fe_sq(r->Z, p->Y);          /* B = Y1^2 */
	fe_sq2(r->T, p->Z);         /* C = 2*Z1^2 */
	fe_add(r->Y, p->X, p->Y);
	fe_sq(t0, r->Y);            /* t0 = (X1+Y1)^2 */
	fe_add(r->Y, r->Z, r->X);   /* Y3 = B+A */
	fe_sub(r->Z, r->Z, r->X);   /* Z3 = B-A */
	fe_sub(r->X, t0, r->Y);     /* X3 = (X1+Y1)^2-(A+B) = 2*X1*Y1 */
	fe_sub(r->T, r->T, r->Z);   /* T3 = C-(B-A) */
}
1580
1581 static void ge_p3_0(ge_p3 *h)
1582 {
1583 fe_0(h->X);
1584 fe_1(h->Y);
1585 fe_1(h->Z);
1586 fe_0(h->T);
1587 }
1588
/**
 * r = p, converted to the cached representation (see ge_p3_to_cached()
 * below; the doubled curve constant d2 it uses is defined first)
 */

/* 2 * d = 16295367250680780974490674513165176452449235426866156013048779062215315747161,
 * used by ge_p3_to_cached() to precompute 2d*T */
static const fe d2 = {
	-21827239, -5839606, -30745221, 13898782, 229458,
	15978800, -12551817, -6495438, 29715968, 9444199
};
1598
1599 static void ge_p3_to_cached(ge_cached *r, const ge_p3 *p)
1600 {
1601 fe_add(r->YplusX,p->Y,p->X);
1602 fe_sub(r->YminusX,p->Y,p->X);
1603 fe_copy(r->Z,p->Z);
1604 fe_mul(r->T2d,p->T,d2);
1605 }
1606
1607 /**
1608 * r = p
1609 */
1610 static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p)
1611 {
1612 fe_copy(r->X,p->X);
1613 fe_copy(r->Y,p->Y);
1614 fe_copy(r->Z,p->Z);
1615 }
1616
1617 void ge_p3_tobytes(uint8_t *s, const ge_p3 *h)
1618 {
1619 fe recip, x, y;
1620
1621 fe_invert(recip,h->Z);
1622 fe_mul(x,h->X,recip);
1623 fe_mul(y,h->Y,recip);
1624 fe_tobytes(s,y);
1625
1626 s[31] ^= fe_isnegative(x) << 7;
1627 }
1628
1629 /**
1630 * r = 2 * p
1631 */
1632 static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p)
1633 {
1634 ge_p2 q;
1635 ge_p3_to_p2(&q,p);
1636 ge_p2_dbl(r,&q);
1637 }
1638
1639 static void ge_precomp_0(ge_precomp *h)
1640 {
1641 fe_1(h->yplusx);
1642 fe_1(h->yminusx);
1643 fe_0(h->xy2d);
1644 }
1645
/**
 * Constant-time equality test: returns 1 if b == c, 0 otherwise,
 * without any data-dependent branch.
 */
static uint8_t equal(int8_t b, int8_t c)
{
	uint32_t diff = (uint8_t) (b ^ c);  /* 0 when equal, 1..255 otherwise */

	/* subtracting 1 wraps to all-ones exactly when diff == 0 */
	return (uint8_t) ((diff - 1) >> 31);
}
1658
/**
 * Constant-time sign test: returns 1 if b < 0, 0 otherwise.
 */
static uint8_t negative(int8_t b)
{
	/* sign-extend b to 64 bits, then extract the sign bit */
	uint64_t w = (uint64_t) (int64_t) b;

	return (uint8_t) (w >> 63);
}
1667
1668 static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b)
1669 {
1670 fe_cmov(t->yplusx,u->yplusx,b);
1671 fe_cmov(t->yminusx,u->yminusx,b);
1672 fe_cmov(t->xy2d,u->xy2d,b);
1673 }
1674
/**
 * base[i][j] = (j+1)*256^i*B: the precomputed table for the
 * constant-time fixed-base scalar multiplication in ge_scalarmult_base()
 */
static const ge_precomp base[32][8] = {
#include "base.h"
};
1681
/**
 * t = |b| * 256^pos * B, negated if b < 0, for a signed digit b in
 * [-8, 8].  Constant time: every table entry is read and the selection
 * is done with arithmetic masks (equal()/cmov()), so neither branches
 * nor the memory access pattern depend on the secret digit b.
 */
static void ge_select(ge_precomp *t, int pos, int8_t b)
{
	ge_precomp minust;
	uint8_t bnegative = negative(b);
	/* babs = |b| without branching: subtract 2*b when b is negative */
	uint8_t babs = b - (((-bnegative) & b) * ((int8_t) 1 << 1));

	/* start from the neutral element (covers b == 0) */
	ge_precomp_0(t);
	cmov(t,&base[pos][0],equal(babs,1));
	cmov(t,&base[pos][1],equal(babs,2));
	cmov(t,&base[pos][2],equal(babs,3));
	cmov(t,&base[pos][3],equal(babs,4));
	cmov(t,&base[pos][4],equal(babs,5));
	cmov(t,&base[pos][5],equal(babs,6));
	cmov(t,&base[pos][6],equal(babs,7));
	cmov(t,&base[pos][7],equal(babs,8));
	/* conditional negation: -(y+x, y-x, 2dxy) = (y-x, y+x, -2dxy) */
	fe_copy(minust.yplusx,t->yminusx);
	fe_copy(minust.yminusx,t->yplusx);
	fe_neg(minust.xy2d,t->xy2d);
	cmov(t,&minust,bnegative);
}
1702
/**
 * r = p - q, with q in cached form (Y+X, Y-X, Z, 2d*T).
 * Same as ge_add() with YplusX/YminusX swapped and C negated.
 */
static void ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q)
{
	fe t0;

	fe_add(r->X, p->Y, p->X);       /* Y1+X1 */
	fe_sub(r->Y, p->Y, p->X);       /* Y1-X1 */
	fe_mul(r->Z, r->X, q->YminusX); /* A = (Y1+X1)(Y2-X2) */
	fe_mul(r->Y, r->Y, q->YplusX);  /* B = (Y1-X1)(Y2+X2) */
	fe_mul(r->T, q->T2d, p->T);     /* C = 2d*T1*T2 */
	fe_mul(r->X, p->Z, q->Z);       /* Z1*Z2 */
	fe_add(t0, r->X, r->X);         /* D = 2*Z1*Z2 */
	fe_sub(r->X, r->Z, r->Y);       /* X3 = A-B */
	fe_add(r->Y, r->Z, r->Y);       /* Y3 = A+B */
	fe_sub(r->Z, t0, r->T);         /* Z3 = D-C */
	fe_add(r->T, t0, r->T);         /* T3 = D+C */
}
1722
1723 void ge_tobytes(uint8_t *s, const ge_p2 *h)
1724 {
1725 fe recip, x, y;
1726
1727 fe_invert(recip,h->Z);
1728 fe_mul(x,h->X,recip);
1729 fe_mul(y,h->Y,recip);
1730 fe_tobytes(s,y);
1731
1732 s[31] ^= fe_isnegative(x) << 7;
1733 }
1734
/**
 * h = a * B (implemented by ge_scalarmult_base() further below)
 * where a = a[0]+256*a[1]+...+256^31 a[31]
 * B is the Ed25519 base point (x,4/5) with x positive.
 *
 * Preconditions:
 *   a[31] <= 127
 */
1743
1744 /**
1745 * r = a * A + b * B
1746 * where a = a[0]+256*a[1]+...+256^31 a[31].
1747 * and b = b[0]+256*b[1]+...+256^31 b[31].
1748 * B is the Ed25519 base point (x,4/5) with x positive.
1749 */
1750 void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a, const ge_p3 *A,
1751 const uint8_t *b)
1752 {
1753 int8_t aslide[256];
1754 int8_t bslide[256];
1755 ge_cached Ai[8]; /* A,3A,5A,7A,9A,11A,13A,15A */
1756 ge_p1p1 t;
1757 ge_p3 u, A2;
1758 int i;
1759
1760 slide(aslide,a);
1761 slide(bslide,b);
1762
1763 ge_p3_to_cached(&Ai[0],A);
1764 ge_p3_dbl(&t,A);
1765 ge_p1p1_to_p3(&A2,&t);
1766
1767 ge_add(&t,&A2,&Ai[0]);
1768 ge_p1p1_to_p3(&u,&t);
1769 ge_p3_to_cached(&Ai[1],&u);
1770
1771 ge_add(&t,&A2,&Ai[1]);
1772 ge_p1p1_to_p3(&u,&t);
1773 ge_p3_to_cached(&Ai[2],&u);
1774
1775 ge_add(&t,&A2,&Ai[2]);
1776 ge_p1p1_to_p3(&u,&t);
1777 ge_p3_to_cached(&Ai[3],&u);
1778
1779 ge_add(&t,&A2,&Ai[3]);
1780 ge_p1p1_to_p3(&u,&t);
1781 ge_p3_to_cached(&Ai[4],&u);
1782
1783 ge_add(&t,&A2,&Ai[4]);
1784 ge_p1p1_to_p3(&u,&t);
1785 ge_p3_to_cached(&Ai[5],&u);
1786
1787 ge_add(&t,&A2,&Ai[5]);
1788 ge_p1p1_to_p3(&u,&t);
1789 ge_p3_to_cached(&Ai[6],&u);
1790
1791 ge_add(&t,&A2,&Ai[6]);
1792 ge_p1p1_to_p3(&u,&t);
1793 ge_p3_to_cached(&Ai[7],&u);
1794
1795 ge_p2_0(r);
1796
1797 for (i = 255; i >= 0; --i)
1798 {
1799 if (aslide[i] || bslide[i])
1800 {
1801 break;
1802 }
1803 }
1804
1805 for (; i >= 0 ;--i)
1806 {
1807 ge_p2_dbl(&t,r);
1808
1809 if (aslide[i] > 0)
1810 {
1811 ge_p1p1_to_p3(&u,&t);
1812 ge_add(&t,&u,&Ai[aslide[i]/2]);
1813 }
1814 else if (aslide[i] < 0)
1815 {
1816 ge_p1p1_to_p3(&u,&t);
1817 ge_sub(&t,&u,&Ai[(-aslide[i])/2]);
1818 }
1819
1820 if (bslide[i] > 0)
1821 {
1822 ge_p1p1_to_p3(&u,&t);
1823 ge_madd(&t,&u,&Bi[bslide[i]/2]);
1824 }
1825 else if (bslide[i] < 0)
1826 {
1827 ge_p1p1_to_p3(&u,&t);
1828 ge_msub(&t,&u,&Bi[(-bslide[i])/2]);
1829 }
1830 ge_p1p1_to_p2(r,&t);
1831 }
1832 }
1833
/**
 * h = a * B, where a = a[0]+256*a[1]+...+256^31 a[31] and B is the
 * Ed25519 base point.  Constant time with respect to the scalar a
 * (fixed digit recoding plus ge_select() table lookups).
 *
 * Precondition: a[31] <= 127.
 */
void ge_scalarmult_base(ge_p3 *h, const uint8_t *a)
{
	int8_t e[64];
	int8_t carry = 0;
	ge_p1p1 r;
	ge_p2 s;
	ge_precomp t;
	int i;

	/* split the scalar into 64 radix-16 digits */
	for (i = 0; i < 32; ++i)
	{
		e[2 * i + 0] = (a[i] >> 0) & 15;
		e[2 * i + 1] = (a[i] >> 4) & 15;
	}
	/* each e[i] is between 0 and 15 */
	/* e[63] is between 0 and 7 */

	/* recode to signed digits so that negated table entries can be used;
	 * carries propagate into the next digit */
	for (i = 0; i < 63; ++i) {
		e[i] += carry;
		carry = e[i] + 8;
		carry >>= 4;
		e[i] -= carry * ((int8_t) 1 << 4);
	}
	e[63] += carry;
	/* each e[i] is between -8 and 8 */

	/* first pass: accumulate the odd digits, h = sum e[2j+1]*256^j*B */
	ge_p3_0(h);
	for (i = 1; i < 64; i += 2)
	{
		ge_select(&t,i / 2,e[i]);
		ge_madd(&r,h,&t); ge_p1p1_to_p3(h,&r);
	}

	/* h *= 16 to shift the odd-digit sum into place */
	ge_p3_dbl(&r,h); ge_p1p1_to_p2(&s,&r);
	ge_p2_dbl(&r,&s); ge_p1p1_to_p2(&s,&r);
	ge_p2_dbl(&r,&s); ge_p1p1_to_p2(&s,&r);
	ge_p2_dbl(&r,&s); ge_p1p1_to_p3(h,&r);

	/* second pass: add the even digits, h += sum e[2j]*256^j*B */
	for (i = 0; i < 64; i += 2)
	{
		ge_select(&t,i / 2,e[i]);
		ge_madd(&r,h,&t);
		ge_p1p1_to_p3(h,&r);
	}
}
1879
1880 /**
1881 * Input:
1882 * a[0]+256*a[1]+...+256^31*a[31] = a
1883 * b[0]+256*b[1]+...+256^31*b[31] = b
1884 * c[0]+256*c[1]+...+256^31*c[31] = c
1885 *
1886 * Output:
1887 * s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l
1888 * where l = 2^252 + 27742317777372353535851937790883648493.
1889 */
void sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b, const uint8_t *c)
{
	/* Unpack each 32-byte scalar into 12 limbs of (at most) 21 bits
	 * each (radix 2^21); 2097151 = 2^21 - 1. */
	int64_t a0 = 2097151 & load_3(a);
	int64_t a1 = 2097151 & (load_4(a + 2) >> 5);
	int64_t a2 = 2097151 & (load_3(a + 5) >> 2);
	int64_t a3 = 2097151 & (load_4(a + 7) >> 7);
	int64_t a4 = 2097151 & (load_4(a + 10) >> 4);
	int64_t a5 = 2097151 & (load_3(a + 13) >> 1);
	int64_t a6 = 2097151 & (load_4(a + 15) >> 6);
	int64_t a7 = 2097151 & (load_3(a + 18) >> 3);
	int64_t a8 = 2097151 & load_3(a + 21);
	int64_t a9 = 2097151 & (load_4(a + 23) >> 5);
	int64_t a10 = 2097151 & (load_3(a + 26) >> 2);
	int64_t a11 = (load_4(a + 28) >> 7);

	int64_t b0 = 2097151 & load_3(b);
	int64_t b1 = 2097151 & (load_4(b + 2) >> 5);
	int64_t b2 = 2097151 & (load_3(b + 5) >> 2);
	int64_t b3 = 2097151 & (load_4(b + 7) >> 7);
	int64_t b4 = 2097151 & (load_4(b + 10) >> 4);
	int64_t b5 = 2097151 & (load_3(b + 13) >> 1);
	int64_t b6 = 2097151 & (load_4(b + 15) >> 6);
	int64_t b7 = 2097151 & (load_3(b + 18) >> 3);
	int64_t b8 = 2097151 & load_3(b + 21);
	int64_t b9 = 2097151 & (load_4(b + 23) >> 5);
	int64_t b10 = 2097151 & (load_3(b + 26) >> 2);
	int64_t b11 = (load_4(b + 28) >> 7);

	int64_t c0 = 2097151 & load_3(c);
	int64_t c1 = 2097151 & (load_4(c + 2) >> 5);
	int64_t c2 = 2097151 & (load_3(c + 5) >> 2);
	int64_t c3 = 2097151 & (load_4(c + 7) >> 7);
	int64_t c4 = 2097151 & (load_4(c + 10) >> 4);
	int64_t c5 = 2097151 & (load_3(c + 13) >> 1);
	int64_t c6 = 2097151 & (load_4(c + 15) >> 6);
	int64_t c7 = 2097151 & (load_3(c + 18) >> 3);
	int64_t c8 = 2097151 & load_3(c + 21);
	int64_t c9 = 2097151 & (load_4(c + 23) >> 5);
	int64_t c10 = 2097151 & (load_3(c + 26) >> 2);
	int64_t c11 = (load_4(c + 28) >> 7);

	int64_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11;
	int64_t s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23;

	int64_t carry0, carry1, carry2, carry3, carry4, carry5, carry6;
	int64_t carry7, carry8, carry9, carry10, carry11, carry12, carry13;
	int64_t carry14, carry15, carry16, carry17, carry18, carry19, carry20;
	int64_t carry21, carry22;

	/* schoolbook multiplication: s = c + a*b over 23 limbs */
	s0 = c0 + a0*b0;
	s1 = c1 + a0*b1 + a1*b0;
	s2 = c2 + a0*b2 + a1*b1 + a2*b0;
	s3 = c3 + a0*b3 + a1*b2 + a2*b1 + a3*b0;
	s4 = c4 + a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0;
	s5 = c5 + a0*b5 + a1*b4 + a2*b3 + a3*b2 + a4*b1 + a5*b0;
	s6 = c6 + a0*b6 + a1*b5 + a2*b4 + a3*b3 + a4*b2 + a5*b1 + a6*b0;
	s7 = c7 + a0*b7 + a1*b6 + a2*b5 + a3*b4 + a4*b3 + a5*b2 + a6*b1 + a7*b0;
	s8 = c8 + a0*b8 + a1*b7 + a2*b6 + a3*b5 + a4*b4 + a5*b3 + a6*b2 + a7*b1 + a8*b0;
	s9 = c9 + a0*b9 + a1*b8 + a2*b7 + a3*b6 + a4*b5 + a5*b4 + a6*b3 + a7*b2 + a8*b1 + a9*b0;
	s10 = c10 + a0*b10 + a1*b9 + a2*b8 + a3*b7 + a4*b6 + a5*b5 + a6*b4 + a7*b3 + a8*b2 + a9*b1 + a10*b0;
	s11 = c11 + a0*b11 + a1*b10 + a2*b9 + a3*b8 + a4*b7 + a5*b6 + a6*b5 + a7*b4 + a8*b3 + a9*b2 + a10*b1 + a11*b0;
	s12 = a1*b11 + a2*b10 + a3*b9 + a4*b8 + a5*b7 + a6*b6 + a7*b5 + a8*b4 + a9*b3 + a10*b2 + a11*b1;
	s13 = a2*b11 + a3*b10 + a4*b9 + a5*b8 + a6*b7 + a7*b6 + a8*b5 + a9*b4 + a10*b3 + a11*b2;
	s14 = a3*b11 + a4*b10 + a5*b9 + a6*b8 + a7*b7 + a8*b6 + a9*b5 + a10*b4 + a11*b3;
	s15 = a4*b11 + a5*b10 + a6*b9 + a7*b8 + a8*b7 + a9*b6 + a10*b5 + a11*b4;
	s16 = a5*b11 + a6*b10 + a7*b9 + a8*b8 + a9*b7 + a10*b6 + a11*b5;
	s17 = a6*b11 + a7*b10 + a8*b9 + a9*b8 + a10*b7 + a11*b6;
	s18 = a7*b11 + a8*b10 + a9*b9 + a10*b8 + a11*b7;
	s19 = a8*b11 + a9*b10 + a10*b9 + a11*b8;
	s20 = a9*b11 + a10*b10 + a11*b9;
	s21 = a10*b11 + a11*b10;
	s22 = a11*b11;
	s23 = 0;

	/* first carry pass (even limbs, then odd limbs) to bring every limb
	 * back near 21 bits before reduction */
	carry0 = (s0 + (int64_t) (1L << 20)) >> 21;
	s1 += carry0;
	s0 -= carry0 * ((uint64_t) 1L << 21);

	carry2 = (s2 + (int64_t) (1L << 20)) >> 21;
	s3 += carry2;
	s2 -= carry2 * ((uint64_t) 1L << 21);

	carry4 = (s4 + (int64_t) (1L << 20)) >> 21;
	s5 += carry4;
	s4 -= carry4 * ((uint64_t) 1L << 21);

	carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
	s7 += carry6;
	s6 -= carry6 * ((uint64_t) 1L << 21);

	carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
	s9 += carry8;
	s8 -= carry8 * ((uint64_t) 1L << 21);

	carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
	s11 += carry10;
	s10 -= carry10 * ((uint64_t) 1L << 21);

	carry12 = (s12 + (int64_t) (1L << 20)) >> 21;
	s13 += carry12;
	s12 -= carry12 * ((uint64_t) 1L << 21);

	carry14 = (s14 + (int64_t) (1L << 20)) >> 21;
	s15 += carry14;
	s14 -= carry14 * ((uint64_t) 1L << 21);

	carry16 = (s16 + (int64_t) (1L << 20)) >> 21;
	s17 += carry16;
	s16 -= carry16 * ((uint64_t) 1L << 21);

	carry18 = (s18 + (int64_t) (1L << 20)) >> 21;
	s19 += carry18;
	s18 -= carry18 * ((uint64_t) 1L << 21);

	carry20 = (s20 + (int64_t) (1L << 20)) >> 21;
	s21 += carry20;
	s20 -= carry20 * ((uint64_t) 1L << 21);

	carry22 = (s22 + (int64_t) (1L << 20)) >> 21;
	s23 += carry22;
	s22 -= carry22 * ((uint64_t) 1L << 21);

	carry1 = (s1 + (int64_t) (1L << 20)) >> 21;
	s2 += carry1;
	s1 -= carry1 * ((uint64_t) 1L << 21);

	carry3 = (s3 + (int64_t) (1L << 20)) >> 21;
	s4 += carry3;
	s3 -= carry3 * ((uint64_t) 1L << 21);

	carry5 = (s5 + (int64_t) (1L << 20)) >> 21;
	s6 += carry5;
	s5 -= carry5 * ((uint64_t) 1L << 21);

	carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
	s8 += carry7;
	s7 -= carry7 * ((uint64_t) 1L << 21);

	carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
	s10 += carry9;
	s9 -= carry9 * ((uint64_t) 1L << 21);

	carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
	s12 += carry11;
	s11 -= carry11 * ((uint64_t) 1L << 21);

	carry13 = (s13 + (int64_t) (1L << 20)) >> 21;
	s14 += carry13;
	s13 -= carry13 * ((uint64_t) 1L << 21);

	carry15 = (s15 + (int64_t) (1L << 20)) >> 21;
	s16 += carry15;
	s15 -= carry15 * ((uint64_t) 1L << 21);

	carry17 = (s17 + (int64_t) (1L << 20)) >> 21;
	s18 += carry17;
	s17 -= carry17 * ((uint64_t) 1L << 21);

	carry19 = (s19 + (int64_t) (1L << 20)) >> 21;
	s20 += carry19;
	s19 -= carry19 * ((uint64_t) 1L << 21);

	carry21 = (s21 + (int64_t) (1L << 20)) >> 21;
	s22 += carry21;
	s21 -= carry21 * ((uint64_t) 1L << 21);

	/* Eliminate the top limbs s23..s18 using
	 * 2^252 == -27742317777372353535851937790883648493 (mod l);
	 * the constants 666643,470296,654183,997805,136657,683901 are the
	 * 21-bit limbs of that value. */
	s11 += s23 * 666643;
	s12 += s23 * 470296;
	s13 += s23 * 654183;
	s14 -= s23 * 997805;
	s15 += s23 * 136657;
	s16 -= s23 * 683901;

	s10 += s22 * 666643;
	s11 += s22 * 470296;
	s12 += s22 * 654183;
	s13 -= s22 * 997805;
	s14 += s22 * 136657;
	s15 -= s22 * 683901;

	s9 += s21 * 666643;
	s10 += s21 * 470296;
	s11 += s21 * 654183;
	s12 -= s21 * 997805;
	s13 += s21 * 136657;
	s14 -= s21 * 683901;

	s8 += s20 * 666643;
	s9 += s20 * 470296;
	s10 += s20 * 654183;
	s11 -= s20 * 997805;
	s12 += s20 * 136657;
	s13 -= s20 * 683901;

	s7 += s19 * 666643;
	s8 += s19 * 470296;
	s9 += s19 * 654183;
	s10 -= s19 * 997805;
	s11 += s19 * 136657;
	s12 -= s19 * 683901;

	s6 += s18 * 666643;
	s7 += s18 * 470296;
	s8 += s18 * 654183;
	s9 -= s18 * 997805;
	s10 += s18 * 136657;
	s11 -= s18 * 683901;

	/* carry pass to keep the middle limbs small */
	carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
	s7 += carry6;
	s6 -= carry6 * ((uint64_t) 1L << 21);

	carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
	s9 += carry8;
	s8 -= carry8 * ((uint64_t) 1L << 21);

	carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
	s11 += carry10;
	s10 -= carry10 * ((uint64_t) 1L << 21);

	carry12 = (s12 + (int64_t) (1L << 20)) >> 21;
	s13 += carry12;
	s12 -= carry12 * ((uint64_t) 1L << 21);

	carry14 = (s14 + (int64_t) (1L << 20)) >> 21;
	s15 += carry14;
	s14 -= carry14 * ((uint64_t) 1L << 21);

	carry16 = (s16 + (int64_t) (1L << 20)) >> 21;
	s17 += carry16;
	s16 -= carry16 * ((uint64_t) 1L << 21);

	carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
	s8 += carry7;
	s7 -= carry7 * ((uint64_t) 1L << 21);

	carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
	s10 += carry9;
	s9 -= carry9 * ((uint64_t) 1L << 21);

	carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
	s12 += carry11;
	s11 -= carry11 * ((uint64_t) 1L << 21);

	carry13 = (s13 + (int64_t) (1L << 20)) >> 21;
	s14 += carry13;
	s13 -= carry13 * ((uint64_t) 1L << 21);

	carry15 = (s15 + (int64_t) (1L << 20)) >> 21;
	s16 += carry15;
	s15 -= carry15 * ((uint64_t) 1L << 21);

	/* eliminate limbs s17..s12 the same way */
	s5 += s17 * 666643;
	s6 += s17 * 470296;
	s7 += s17 * 654183;
	s8 -= s17 * 997805;
	s9 += s17 * 136657;
	s10 -= s17 * 683901;

	s4 += s16 * 666643;
	s5 += s16 * 470296;
	s6 += s16 * 654183;
	s7 -= s16 * 997805;
	s8 += s16 * 136657;
	s9 -= s16 * 683901;

	s3 += s15 * 666643;
	s4 += s15 * 470296;
	s5 += s15 * 654183;
	s6 -= s15 * 997805;
	s7 += s15 * 136657;
	s8 -= s15 * 683901;

	s2 += s14 * 666643;
	s3 += s14 * 470296;
	s4 += s14 * 654183;
	s5 -= s14 * 997805;
	s6 += s14 * 136657;
	s7 -= s14 * 683901;

	s1 += s13 * 666643;
	s2 += s13 * 470296;
	s3 += s13 * 654183;
	s4 -= s13 * 997805;
	s5 += s13 * 136657;
	s6 -= s13 * 683901;

	s0 += s12 * 666643;
	s1 += s12 * 470296;
	s2 += s12 * 654183;
	s3 -= s12 * 997805;
	s4 += s12 * 136657;
	s5 -= s12 * 683901;
	s12 = 0;

	/* carry pass over the 12 remaining limbs; a carry out of s11 goes
	 * into s12 and is folded back below */
	carry0 = (s0 + (int64_t) (1L << 20)) >> 21;
	s1 += carry0;
	s0 -= carry0 * ((uint64_t) 1L << 21);

	carry2 = (s2 + (int64_t) (1L << 20)) >> 21;
	s3 += carry2;
	s2 -= carry2 * ((uint64_t) 1L << 21);

	carry4 = (s4 + (int64_t) (1L << 20)) >> 21;
	s5 += carry4;
	s4 -= carry4 * ((uint64_t) 1L << 21);

	carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
	s7 += carry6;
	s6 -= carry6 * ((uint64_t) 1L << 21);

	carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
	s9 += carry8;
	s8 -= carry8 * ((uint64_t) 1L << 21);

	carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
	s11 += carry10;
	s10 -= carry10 * ((uint64_t) 1L << 21);

	carry1 = (s1 + (int64_t) (1L << 20)) >> 21;
	s2 += carry1;
	s1 -= carry1 * ((uint64_t) 1L << 21);

	carry3 = (s3 + (int64_t) (1L << 20)) >> 21;
	s4 += carry3;
	s3 -= carry3 * ((uint64_t) 1L << 21);

	carry5 = (s5 + (int64_t) (1L << 20)) >> 21;
	s6 += carry5;
	s5 -= carry5 * ((uint64_t) 1L << 21);

	carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
	s8 += carry7;
	s7 -= carry7 * ((uint64_t) 1L << 21);

	carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
	s10 += carry9;
	s9 -= carry9 * ((uint64_t) 1L << 21);

	carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
	s12 += carry11;
	s11 -= carry11 * ((uint64_t) 1L << 21);

	/* fold the s12 overflow back in */
	s0 += s12 * 666643;
	s1 += s12 * 470296;
	s2 += s12 * 654183;
	s3 -= s12 * 997805;
	s4 += s12 * 136657;
	s5 -= s12 * 683901;
	s12 = 0;

	/* sequential (non-centered) carry chain */
	carry0 = s0 >> 21;
	s1 += carry0;
	s0 -= carry0 * ((uint64_t) 1L << 21);

	carry1 = s1 >> 21;
	s2 += carry1;
	s1 -= carry1 * ((uint64_t) 1L << 21);

	carry2 = s2 >> 21;
	s3 += carry2;
	s2 -= carry2 * ((uint64_t) 1L << 21);

	carry3 = s3 >> 21;
	s4 += carry3;
	s3 -= carry3 * ((uint64_t) 1L << 21);

	carry4 = s4 >> 21;
	s5 += carry4;
	s4 -= carry4 * ((uint64_t) 1L << 21);

	carry5 = s5 >> 21;
	s6 += carry5;
	s5 -= carry5 * ((uint64_t) 1L << 21);

	carry6 = s6 >> 21;
	s7 += carry6;
	s6 -= carry6 * ((uint64_t) 1L << 21);

	carry7 = s7 >> 21;
	s8 += carry7;
	s7 -= carry7 * ((uint64_t) 1L << 21);

	carry8 = s8 >> 21;
	s9 += carry8;
	s8 -= carry8 * ((uint64_t) 1L << 21);

	carry9 = s9 >> 21;
	s10 += carry9;
	s9 -= carry9 * ((uint64_t) 1L << 21);

	carry10 = s10 >> 21;
	s11 += carry10;
	s10 -= carry10 * ((uint64_t) 1L << 21);

	carry11 = s11 >> 21;
	s12 += carry11;
	s11 -= carry11 * ((uint64_t) 1L << 21);

	/* final fold-in of the last possible overflow */
	s0 += s12 * 666643;
	s1 += s12 * 470296;
	s2 += s12 * 654183;
	s3 -= s12 * 997805;
	s4 += s12 * 136657;
	s5 -= s12 * 683901;

	carry0 = s0 >> 21;
	s1 += carry0;
	s0 -= carry0 * ((uint64_t) 1L << 21);

	carry1 = s1 >> 21;
	s2 += carry1;
	s1 -= carry1 * ((uint64_t) 1L << 21);

	carry2 = s2 >> 21;
	s3 += carry2;
	s2 -= carry2 * ((uint64_t) 1L << 21);

	carry3 = s3 >> 21;
	s4 += carry3;
	s3 -= carry3 * ((uint64_t) 1L << 21);

	carry4 = s4 >> 21;
	s5 += carry4;
	s4 -= carry4 * ((uint64_t) 1L << 21);

	carry5 = s5 >> 21;
	s6 += carry5;
	s5 -= carry5 * ((uint64_t) 1L << 21);

	carry6 = s6 >> 21;
	s7 += carry6;
	s6 -= carry6 * ((uint64_t) 1L << 21);

	carry7 = s7 >> 21;
	s8 += carry7;
	s7 -= carry7 * ((uint64_t) 1L << 21);

	carry8 = s8 >> 21;
	s9 += carry8;
	s8 -= carry8 * ((uint64_t) 1L << 21);

	carry9 = s9 >> 21;
	s10 += carry9;
	s9 -= carry9 * ((uint64_t) 1L << 21);

	carry10 = s10 >> 21;
	s11 += carry10;
	s10 -= carry10 * ((uint64_t) 1L << 21);

	/* pack the 12 21-bit limbs back into 32 little-endian bytes */
	s[0] = s0 >> 0;
	s[1] = s0 >> 8;
	s[2] = (s0 >> 16) | (s1 * ((uint64_t) 1 << 5));
	s[3] = s1 >> 3;
	s[4] = s1 >> 11;
	s[5] = (s1 >> 19) | (s2 * ((uint64_t) 1 << 2));
	s[6] = s2 >> 6;
	s[7] = (s2 >> 14) | (s3 * ((uint64_t) 1 << 7));
	s[8] = s3 >> 1;
	s[9] = s3 >> 9;
	s[10] = (s3 >> 17) | (s4 * ((uint64_t) 1 << 4));
	s[11] = s4 >> 4;
	s[12] = s4 >> 12;
	s[13] = (s4 >> 20) | (s5 * ((uint64_t) 1 << 1));
	s[14] = s5 >> 7;
	s[15] = (s5 >> 15) | (s6 * ((uint64_t) 1 << 6));
	s[16] = s6 >> 2;
	s[17] = s6 >> 10;
	s[18] = (s6 >> 18) | (s7 * ((uint64_t) 1 << 3));
	s[19] = s7 >> 5;
	s[20] = s7 >> 13;
	s[21] = s8 >> 0;
	s[22] = s8 >> 8;
	s[23] = (s8 >> 16) | (s9 * ((uint64_t) 1 << 5));
	s[24] = s9 >> 3;
	s[25] = s9 >> 11;
	s[26] = (s9 >> 19) | (s10 * ((uint64_t) 1 << 2));
	s[27] = s10 >> 6;
	s[28] = (s10 >> 14) | (s11 * ((uint64_t) 1 << 7));
	s[29] = s11 >> 1;
	s[30] = s11 >> 9;
	s[31] = s11 >> 17;
}
2373
2374 /**
2375 * Input:
2376 * s[0]+256*s[1]+...+256^63*s[63] = s
2377 *
2378 * Output:
2379 * s[0]+256*s[1]+...+256^31*s[31] = s mod l
2380 * where l = 2^252 + 27742317777372353535851937790883648493.
2381 * Overwrites s in place.
2382 */
void sc_reduce(uint8_t *s)
{
	/* Decompose the 512-bit little-endian input into 24 limbs of 21 bits
	 * each (2097151 = 2^21 - 1).  The limbs live in signed 64-bit
	 * variables so the products and carries below cannot overflow.
	 * The whole function is straight-line and branch-free, so it runs in
	 * constant time independent of the scalar value. */
	int64_t s0 = 2097151 & load_3(s);
	int64_t s1 = 2097151 & (load_4(s + 2) >> 5);
	int64_t s2 = 2097151 & (load_3(s + 5) >> 2);
	int64_t s3 = 2097151 & (load_4(s + 7) >> 7);
	int64_t s4 = 2097151 & (load_4(s + 10) >> 4);
	int64_t s5 = 2097151 & (load_3(s + 13) >> 1);
	int64_t s6 = 2097151 & (load_4(s + 15) >> 6);
	int64_t s7 = 2097151 & (load_3(s + 18) >> 3);
	int64_t s8 = 2097151 & load_3(s + 21);
	int64_t s9 = 2097151 & (load_4(s + 23) >> 5);
	int64_t s10 = 2097151 & (load_3(s + 26) >> 2);
	int64_t s11 = 2097151 & (load_4(s + 28) >> 7);
	int64_t s12 = 2097151 & (load_4(s + 31) >> 4);
	int64_t s13 = 2097151 & (load_3(s + 34) >> 1);
	int64_t s14 = 2097151 & (load_4(s + 36) >> 6);
	int64_t s15 = 2097151 & (load_3(s + 39) >> 3);
	int64_t s16 = 2097151 & load_3(s + 42);
	int64_t s17 = 2097151 & (load_4(s + 44) >> 5);
	int64_t s18 = 2097151 & (load_3(s + 47) >> 2);
	int64_t s19 = 2097151 & (load_4(s + 49) >> 7);
	int64_t s20 = 2097151 & (load_4(s + 52) >> 4);
	int64_t s21 = 2097151 & (load_3(s + 55) >> 1);
	int64_t s22 = 2097151 & (load_4(s + 57) >> 6);
	/* Top limb: no mask, it keeps all remaining high bits. */
	int64_t s23 = (load_4(s + 60) >> 3);

	int64_t carry0, carry1, carry2, carry3, carry4, carry5, carry6;
	int64_t carry7, carry8, carry9, carry10, carry11, carry12, carry13;
	int64_t carry14, carry15, carry16;

	/* Fold the high limbs down modulo l.  Since l = 2^252 + q with
	 * q = 27742317777372353535851937790883648493, we have
	 * 2^252 == -q (mod l); the constants 666643, 470296, 654183,
	 * -997805, 136657, -683901 are the 21-bit limbs of -q, so
	 * s23 * 2^(21*23) is replaced by its residue spread over s11..s16,
	 * and likewise for s22..s18 below. */
	s11 += s23 * 666643;
	s12 += s23 * 470296;
	s13 += s23 * 654183;
	s14 -= s23 * 997805;
	s15 += s23 * 136657;
	s16 -= s23 * 683901;

	s10 += s22 * 666643;
	s11 += s22 * 470296;
	s12 += s22 * 654183;
	s13 -= s22 * 997805;
	s14 += s22 * 136657;
	s15 -= s22 * 683901;

	s9 += s21 * 666643;
	s10 += s21 * 470296;
	s11 += s21 * 654183;
	s12 -= s21 * 997805;
	s13 += s21 * 136657;
	s14 -= s21 * 683901;

	s8 += s20 * 666643;
	s9 += s20 * 470296;
	s10 += s20 * 654183;
	s11 -= s20 * 997805;
	s12 += s20 * 136657;
	s13 -= s20 * 683901;

	s7 += s19 * 666643;
	s8 += s19 * 470296;
	s9 += s19 * 654183;
	s10 -= s19 * 997805;
	s11 += s19 * 136657;
	s12 -= s19 * 683901;

	s6 += s18 * 666643;
	s7 += s18 * 470296;
	s8 += s18 * 654183;
	s9 -= s18 * 997805;
	s10 += s18 * 136657;
	s11 -= s18 * 683901;

	/* Interleaved rounding carry chain (even limbs first, then odd) to
	 * shrink the accumulated limbs back towards 21 bits before the next
	 * fold.  Adding 2^20 before the arithmetic shift rounds to nearest,
	 * keeping each remaining limb in [-2^20, 2^20]. */
	carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
	s7 += carry6;
	s6 -= carry6 * ((uint64_t) 1L << 21);

	carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
	s9 += carry8;
	s8 -= carry8 * ((uint64_t) 1L << 21);

	carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
	s11 += carry10;
	s10 -= carry10 * ((uint64_t) 1L << 21);

	carry12 = (s12 + (int64_t) (1L << 20)) >> 21;
	s13 += carry12;
	s12 -= carry12 * ((uint64_t) 1L << 21);

	carry14 = (s14 + (int64_t) (1L << 20)) >> 21;
	s15 += carry14;
	s14 -= carry14 * ((uint64_t) 1L << 21);

	carry16 = (s16 + (int64_t) (1L << 20)) >> 21;
	s17 += carry16;
	s16 -= carry16 * ((uint64_t) 1L << 21);

	carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
	s8 += carry7;
	s7 -= carry7 * ((uint64_t) 1L << 21);

	carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
	s10 += carry9;
	s9 -= carry9 * ((uint64_t) 1L << 21);

	carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
	s12 += carry11;
	s11 -= carry11 * ((uint64_t) 1L << 21);

	carry13 = (s13 + (int64_t) (1L << 20)) >> 21;
	s14 += carry13;
	s13 -= carry13 * ((uint64_t) 1L << 21);

	carry15 = (s15 + (int64_t) (1L << 20)) >> 21;
	s16 += carry15;
	s15 -= carry15 * ((uint64_t) 1L << 21);

	/* Second fold: eliminate limbs s17..s12 with the same -q limbs. */
	s5 += s17 * 666643;
	s6 += s17 * 470296;
	s7 += s17 * 654183;
	s8 -= s17 * 997805;
	s9 += s17 * 136657;
	s10 -= s17 * 683901;

	s4 += s16 * 666643;
	s5 += s16 * 470296;
	s6 += s16 * 654183;
	s7 -= s16 * 997805;
	s8 += s16 * 136657;
	s9 -= s16 * 683901;

	s3 += s15 * 666643;
	s4 += s15 * 470296;
	s5 += s15 * 654183;
	s6 -= s15 * 997805;
	s7 += s15 * 136657;
	s8 -= s15 * 683901;

	s2 += s14 * 666643;
	s3 += s14 * 470296;
	s4 += s14 * 654183;
	s5 -= s14 * 997805;
	s6 += s14 * 136657;
	s7 -= s14 * 683901;

	s1 += s13 * 666643;
	s2 += s13 * 470296;
	s3 += s13 * 654183;
	s4 -= s13 * 997805;
	s5 += s13 * 136657;
	s6 -= s13 * 683901;

	s0 += s12 * 666643;
	s1 += s12 * 470296;
	s2 += s12 * 654183;
	s3 -= s12 * 997805;
	s4 += s12 * 136657;
	s5 -= s12 * 683901;
	/* s12 cleared: it serves as the overflow slot for carry11 below. */
	s12 = 0;

	/* Rounding carry pass over the 12 surviving limbs (even, then odd);
	 * carry11 may regenerate a small s12. */
	carry0 = (s0 + (int64_t) (1L << 20)) >> 21;
	s1 += carry0;
	s0 -= carry0 * ((uint64_t) 1L << 21);

	carry2 = (s2 + (int64_t) (1L << 20)) >> 21;
	s3 += carry2;
	s2 -= carry2 * ((uint64_t) 1L << 21);

	carry4 = (s4 + (int64_t) (1L << 20)) >> 21;
	s5 += carry4;
	s4 -= carry4 * ((uint64_t) 1L << 21);

	carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
	s7 += carry6;
	s6 -= carry6 * ((uint64_t) 1L << 21);

	carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
	s9 += carry8;
	s8 -= carry8 * ((uint64_t) 1L << 21);

	carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
	s11 += carry10;
	s10 -= carry10 * ((uint64_t) 1L << 21);

	carry1 = (s1 + (int64_t) (1L << 20)) >> 21;
	s2 += carry1;
	s1 -= carry1 * ((uint64_t) 1L << 21);

	carry3 = (s3 + (int64_t) (1L << 20)) >> 21;
	s4 += carry3;
	s3 -= carry3 * ((uint64_t) 1L << 21);

	carry5 = (s5 + (int64_t) (1L << 20)) >> 21;
	s6 += carry5;
	s5 -= carry5 * ((uint64_t) 1L << 21);

	carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
	s8 += carry7;
	s7 -= carry7 * ((uint64_t) 1L << 21);

	carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
	s10 += carry9;
	s9 -= carry9 * ((uint64_t) 1L << 21);

	carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
	s12 += carry11;
	s11 -= carry11 * ((uint64_t) 1L << 21);

	/* Fold the regenerated s12 back into s0..s5 and clear it again. */
	s0 += s12 * 666643;
	s1 += s12 * 470296;
	s2 += s12 * 654183;
	s3 -= s12 * 997805;
	s4 += s12 * 136657;
	s5 -= s12 * 683901;
	s12 = 0;

	/* Sequential (non-rounding) carry chain: normalize each limb to
	 * [0, 2^21) from the bottom up; a final carry may land in s12. */
	carry0 = s0 >> 21;
	s1 += carry0;
	s0 -= carry0 * ((uint64_t) 1L << 21);

	carry1 = s1 >> 21;
	s2 += carry1;
	s1 -= carry1 * ((uint64_t) 1L << 21);

	carry2 = s2 >> 21;
	s3 += carry2;
	s2 -= carry2 * ((uint64_t) 1L << 21);

	carry3 = s3 >> 21;
	s4 += carry3;
	s3 -= carry3 * ((uint64_t) 1L << 21);

	carry4 = s4 >> 21;
	s5 += carry4;
	s4 -= carry4 * ((uint64_t) 1L << 21);

	carry5 = s5 >> 21;
	s6 += carry5;
	s5 -= carry5 * ((uint64_t) 1L << 21);

	carry6 = s6 >> 21;
	s7 += carry6;
	s6 -= carry6 * ((uint64_t) 1L << 21);

	carry7 = s7 >> 21;
	s8 += carry7;
	s7 -= carry7 * ((uint64_t) 1L << 21);

	carry8 = s8 >> 21;
	s9 += carry8;
	s8 -= carry8 * ((uint64_t) 1L << 21);

	carry9 = s9 >> 21;
	s10 += carry9;
	s9 -= carry9 * ((uint64_t) 1L << 21);

	carry10 = s10 >> 21;
	s11 += carry10;
	s10 -= carry10 * ((uint64_t) 1L << 21);

	carry11 = s11 >> 21;
	s12 += carry11;
	s11 -= carry11 * ((uint64_t) 1L << 21);

	/* Last fold of s12 (at most 1 here, so the result fits below l). */
	s0 += s12 * 666643;
	s1 += s12 * 470296;
	s2 += s12 * 654183;
	s3 -= s12 * 997805;
	s4 += s12 * 136657;
	s5 -= s12 * 683901;

	/* Final carry chain; no carry out of s11 is possible any more. */
	carry0 = s0 >> 21;
	s1 += carry0;
	s0 -= carry0 * ((uint64_t) 1L << 21);

	carry1 = s1 >> 21;
	s2 += carry1;
	s1 -= carry1 * ((uint64_t) 1L << 21);

	carry2 = s2 >> 21;
	s3 += carry2;
	s2 -= carry2 * ((uint64_t) 1L << 21);

	carry3 = s3 >> 21;
	s4 += carry3;
	s3 -= carry3 * ((uint64_t) 1L << 21);

	carry4 = s4 >> 21;
	s5 += carry4;
	s4 -= carry4 * ((uint64_t) 1L << 21);

	carry5 = s5 >> 21;
	s6 += carry5;
	s5 -= carry5 * ((uint64_t) 1L << 21);

	carry6 = s6 >> 21;
	s7 += carry6;
	s6 -= carry6 * ((uint64_t) 1L << 21);

	carry7 = s7 >> 21;
	s8 += carry7;
	s7 -= carry7 * ((uint64_t) 1L << 21);

	carry8 = s8 >> 21;
	s9 += carry8;
	s8 -= carry8 * ((uint64_t) 1L << 21);

	carry9 = s9 >> 21;
	s10 += carry9;
	s9 -= carry9 * ((uint64_t) 1L << 21);

	carry10 = s10 >> 21;
	s11 += carry10;
	s10 -= carry10 * ((uint64_t) 1L << 21);

	/* Repack the twelve 21-bit limbs (12 * 21 = 252 bits) into the 32
	 * little-endian output bytes, overwriting the input buffer. */
	s[0] = s0 >> 0;
	s[1] = s0 >> 8;
	s[2] = (s0 >> 16) | (s1 * ((uint64_t) 1 << 5));
	s[3] = s1 >> 3;
	s[4] = s1 >> 11;
	s[5] = (s1 >> 19) | (s2 * ((uint64_t) 1 << 2));
	s[6] = s2 >> 6;
	s[7] = (s2 >> 14) | (s3 * ((uint64_t) 1 << 7));
	s[8] = s3 >> 1;
	s[9] = s3 >> 9;
	s[10] = (s3 >> 17) | (s4 * ((uint64_t) 1 << 4));
	s[11] = s4 >> 4;
	s[12] = s4 >> 12;
	s[13] = (s4 >> 20) | (s5 * ((uint64_t) 1 << 1));
	s[14] = s5 >> 7;
	s[15] = (s5 >> 15) | (s6 * ((uint64_t) 1 << 6));
	s[16] = s6 >> 2;
	s[17] = s6 >> 10;
	s[18] = (s6 >> 18) | (s7 * ((uint64_t) 1 << 3));
	s[19] = s7 >> 5;
	s[20] = s7 >> 13;
	s[21] = s8 >> 0;
	s[22] = s8 >> 8;
	s[23] = (s8 >> 16) | (s9 * ((uint64_t) 1 << 5));
	s[24] = s9 >> 3;
	s[25] = s9 >> 11;
	s[26] = (s9 >> 19) | (s10 * ((uint64_t) 1 << 2));
	s[27] = s10 >> 6;
	s[28] = (s10 >> 14) | (s11 * ((uint64_t) 1 << 7));
	s[29] = s11 >> 1;
	s[30] = s11 >> 9;
	s[31] = s11 >> 17;
}