]> git.ipfire.org Git - thirdparty/strongswan.git/blob - src/libstrongswan/plugins/curve25519/ref10/ref10.c
Update copyright headers after acquisition by secunet
[thirdparty/strongswan.git] / src / libstrongswan / plugins / curve25519 / ref10 / ref10.c
1 /*
2 * Copyright (C) 2016 Andreas Steffen
3 *
4 * Based on the public domain libsodium adaptation by Frank Denis
5 * of the SUPERCOP ref10 implementation by Daniel J. Bernstein,
6 * Niels Duif, Peter Schwabe, Tanja Lange and Bo-Yin Yang.
7 */
8
9 #include <stddef.h>
10 #include <stdint.h>
11 #include <string.h>
12
13 #include "ref10.h"
14
15 #include <utils/utils.h>
16
/**
 * Load 3 bytes in little-endian order into a 64-bit integer.
 */
static uint64_t load_3(const uint8_t *in)
{
	return ((uint64_t) in[0]) |
		   (((uint64_t) in[1]) << 8) |
		   (((uint64_t) in[2]) << 16);
}
27
/**
 * Load 4 bytes in little-endian order into a 64-bit integer.
 */
static uint64_t load_4(const uint8_t *in)
{
	return ((uint64_t) in[0]) |
		   (((uint64_t) in[1]) << 8) |
		   (((uint64_t) in[2]) << 16) |
		   (((uint64_t) in[3]) << 24);
}
39
40 /**
41 * h = 0
42 */
43 static void fe_0(fe h)
44 {
45 memset(&h[0], 0, 10 * sizeof h[0]);
46 }
47
48 /**
49 * h = 1
50 */
51 static void fe_1(fe h)
52 {
53 h[0] = 1;
54 h[1] = 0;
55 memset(&h[2], 0, 8 * sizeof h[0]);
56 }
57
58 /**
59 * h = f + g
60 * Can overlap h with f or g.
61 *
62 * Preconditions:
63 * |f| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
64 * |g| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
65 *
66 * Postconditions:
67 * |h| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25, etc.
68 */
69 static void fe_add(fe h, const fe f, const fe g)
70 {
71 int32_t f0 = f[0];
72 int32_t f1 = f[1];
73 int32_t f2 = f[2];
74 int32_t f3 = f[3];
75 int32_t f4 = f[4];
76 int32_t f5 = f[5];
77 int32_t f6 = f[6];
78 int32_t f7 = f[7];
79 int32_t f8 = f[8];
80 int32_t f9 = f[9];
81
82 int32_t g0 = g[0];
83 int32_t g1 = g[1];
84 int32_t g2 = g[2];
85 int32_t g3 = g[3];
86 int32_t g4 = g[4];
87 int32_t g5 = g[5];
88 int32_t g6 = g[6];
89 int32_t g7 = g[7];
90 int32_t g8 = g[8];
91 int32_t g9 = g[9];
92
93 int32_t h0 = f0 + g0;
94 int32_t h1 = f1 + g1;
95 int32_t h2 = f2 + g2;
96 int32_t h3 = f3 + g3;
97 int32_t h4 = f4 + g4;
98 int32_t h5 = f5 + g5;
99 int32_t h6 = f6 + g6;
100 int32_t h7 = f7 + g7;
101 int32_t h8 = f8 + g8;
102 int32_t h9 = f9 + g9;
103
104 h[0] = h0;
105 h[1] = h1;
106 h[2] = h2;
107 h[3] = h3;
108 h[4] = h4;
109 h[5] = h5;
110 h[6] = h6;
111 h[7] = h7;
112 h[8] = h8;
113 h[9] = h9;
114 }
115
116 /**
117 * Replace (f,g) with (g,g) if b == 1;
118 * replace (f,g) with (f,g) if b == 0.
119 *
120 * Preconditions: b in {0,1}.
121 */
122 static void fe_cmov(fe f, const fe g, unsigned int b)
123 {
124 int32_t f0 = f[0];
125 int32_t f1 = f[1];
126 int32_t f2 = f[2];
127 int32_t f3 = f[3];
128 int32_t f4 = f[4];
129 int32_t f5 = f[5];
130 int32_t f6 = f[6];
131 int32_t f7 = f[7];
132 int32_t f8 = f[8];
133 int32_t f9 = f[9];
134
135 int32_t g0 = g[0];
136 int32_t g1 = g[1];
137 int32_t g2 = g[2];
138 int32_t g3 = g[3];
139 int32_t g4 = g[4];
140 int32_t g5 = g[5];
141 int32_t g6 = g[6];
142 int32_t g7 = g[7];
143 int32_t g8 = g[8];
144 int32_t g9 = g[9];
145
146 int32_t x0 = f0 ^ g0;
147 int32_t x1 = f1 ^ g1;
148 int32_t x2 = f2 ^ g2;
149 int32_t x3 = f3 ^ g3;
150 int32_t x4 = f4 ^ g4;
151 int32_t x5 = f5 ^ g5;
152 int32_t x6 = f6 ^ g6;
153 int32_t x7 = f7 ^ g7;
154 int32_t x8 = f8 ^ g8;
155 int32_t x9 = f9 ^ g9;
156
157 b = (unsigned int) (- (int) b);
158
159 x0 &= b;
160 x1 &= b;
161 x2 &= b;
162 x3 &= b;
163 x4 &= b;
164 x5 &= b;
165 x6 &= b;
166 x7 &= b;
167 x8 &= b;
168 x9 &= b;
169
170 f[0] = f0 ^ x0;
171 f[1] = f1 ^ x1;
172 f[2] = f2 ^ x2;
173 f[3] = f3 ^ x3;
174 f[4] = f4 ^ x4;
175 f[5] = f5 ^ x5;
176 f[6] = f6 ^ x6;
177 f[7] = f7 ^ x7;
178 f[8] = f8 ^ x8;
179 f[9] = f9 ^ x9;
180 }
181
182 /**
183 * h = f
184 */
185 static void fe_copy(fe h, const fe f)
186 {
187 int32_t f0 = f[0];
188 int32_t f1 = f[1];
189 int32_t f2 = f[2];
190 int32_t f3 = f[3];
191 int32_t f4 = f[4];
192 int32_t f5 = f[5];
193 int32_t f6 = f[6];
194 int32_t f7 = f[7];
195 int32_t f8 = f[8];
196 int32_t f9 = f[9];
197
198 h[0] = f0;
199 h[1] = f1;
200 h[2] = f2;
201 h[3] = f3;
202 h[4] = f4;
203 h[5] = f5;
204 h[6] = f6;
205 h[7] = f7;
206 h[8] = f8;
207 h[9] = f9;
208 }
209
/**
 * Parse a 32-byte little-endian string s into a field element h.
 *
 * Ignores top bit of h (bit 255 of s is masked off, since the field is
 * modulo 2^255-19).
 *
 * The ten limbs are loaded as alternating 26/25-bit windows of the
 * 255-bit value, then one pass of carries brings every limb into its
 * nominal range.
 */
static void fe_frombytes(fe h, const uint8_t *s)
{
	/* alternating 26/25-bit limbs; the shifts realign each 3- or 4-byte
	 * window to its limb boundary */
	int64_t h0 = load_4(s);
	int64_t h1 = load_3(s + 4) << 6;
	int64_t h2 = load_3(s + 7) << 5;
	int64_t h3 = load_3(s + 10) << 3;
	int64_t h4 = load_3(s + 13) << 2;
	int64_t h5 = load_4(s + 16);
	int64_t h6 = load_3(s + 20) << 7;
	int64_t h7 = load_3(s + 23) << 5;
	int64_t h8 = load_3(s + 26) << 4;
	/* 8388607 = 2^23-1 masks off the top bit of the input */
	int64_t h9 = (load_3(s + 29) & 8388607) << 2;

	int64_t carry0, carry1, carry2, carry3, carry4;
	int64_t carry5, carry6, carry7, carry8, carry9;

	/* carry the odd limbs first; the carry out of h9 wraps to h0 as *19
	 * because 2^255 == 19 (mod 2^255-19) */
	carry9 = (h9 + (int64_t) (1L << 24)) >> 25;
	h0 += carry9 * 19;
	h9 -= carry9 * ((uint64_t) 1L << 25);

	carry1 = (h1 + (int64_t) (1L << 24)) >> 25;
	h2 += carry1;
	h1 -= carry1 * ((uint64_t) 1L << 25);

	carry3 = (h3 + (int64_t) (1L << 24)) >> 25;
	h4 += carry3;
	h3 -= carry3 * ((uint64_t) 1L << 25);

	carry5 = (h5 + (int64_t) (1L << 24)) >> 25;
	h6 += carry5;
	h5 -= carry5 * ((uint64_t) 1L << 25);

	carry7 = (h7 + (int64_t) (1L << 24)) >> 25;
	h8 += carry7;
	h7 -= carry7 * ((uint64_t) 1L << 25);

	/* then the even (26-bit) limbs */
	carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
	h1 += carry0;
	h0 -= carry0 * ((uint64_t) 1L << 26);

	carry2 = (h2 + (int64_t) (1L << 25)) >> 26;
	h3 += carry2;
	h2 -= carry2 * ((uint64_t) 1L << 26);

	carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
	h5 += carry4;
	h4 -= carry4 * ((uint64_t) 1L << 26);

	carry6 = (h6 + (int64_t) (1L << 25)) >> 26;
	h7 += carry6;
	h6 -= carry6 * ((uint64_t) 1L << 26);

	carry8 = (h8 + (int64_t) (1L << 25)) >> 26;
	h9 += carry8;
	h8 -= carry8 * ((uint64_t) 1L << 26);

	h[0] = (int32_t) h0;
	h[1] = (int32_t) h1;
	h[2] = (int32_t) h2;
	h[3] = (int32_t) h3;
	h[4] = (int32_t) h4;
	h[5] = (int32_t) h5;
	h[6] = (int32_t) h6;
	h[7] = (int32_t) h7;
	h[8] = (int32_t) h8;
	h[9] = (int32_t) h9;
}
280
/**
 * Serialize the field element h to 32 bytes s in little-endian order,
 * fully reduced to the canonical representative in [0, 2^255-20].
 *
 * Preconditions:
 * |h| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25, etc.
 *
 * Write p=2^255-19; q=floor(h/p).
 * Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
 *
 * Proof:
 * Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
 * Also have |h-2^230 h9|<2^231 so |19 2^(-255)(h-2^230 h9)|<1/4.
 *
 * Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
 * Then 0<y<1.
 *
 * Write r=h-pq.
 * Have 0<=r<=p-1=2^255-20.
 * Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
 *
 * Write x=r+19(2^-255)r+y.
 * Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
 *
 * Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
 * so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
 */
static void fe_tobytes(uint8_t *s, const fe h)
{
	int32_t h0 = h[0];
	int32_t h1 = h[1];
	int32_t h2 = h[2];
	int32_t h3 = h[3];
	int32_t h4 = h[4];
	int32_t h5 = h[5];
	int32_t h6 = h[6];
	int32_t h7 = h[7];
	int32_t h8 = h[8];
	int32_t h9 = h[9];

	int32_t carry0, carry1, carry2, carry3, carry4;
	int32_t carry5, carry6, carry7, carry8, carry9;
	int32_t q;

	/* compute q = floor(h/p) per the claim above by rippling the rounded
	 * carry through all limbs */
	q = (19 * h9 + ((uint32_t) 1L << 24)) >> 25;
	q = (h0 + q) >> 26;
	q = (h1 + q) >> 25;
	q = (h2 + q) >> 26;
	q = (h3 + q) >> 25;
	q = (h4 + q) >> 26;
	q = (h5 + q) >> 25;
	q = (h6 + q) >> 26;
	q = (h7 + q) >> 25;
	q = (h8 + q) >> 26;
	q = (h9 + q) >> 25;

	/* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
	h0 += 19 * q;
	/* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */

	carry0 = h0 >> 26;
	h1 += carry0;
	h0 -= carry0 * ((uint32_t) 1L << 26);

	carry1 = h1 >> 25;
	h2 += carry1;
	h1 -= carry1 * ((uint32_t) 1L << 25);

	carry2 = h2 >> 26;
	h3 += carry2;
	h2 -= carry2 * ((uint32_t) 1L << 26);

	carry3 = h3 >> 25;
	h4 += carry3;
	h3 -= carry3 * ((uint32_t) 1L << 25);

	carry4 = h4 >> 26;
	h5 += carry4;
	h4 -= carry4 * ((uint32_t) 1L << 26);

	carry5 = h5 >> 25;
	h6 += carry5;
	h5 -= carry5 * ((uint32_t) 1L << 25);

	carry6 = h6 >> 26;
	h7 += carry6;
	h6 -= carry6 * ((uint32_t) 1L << 26);

	carry7 = h7 >> 25;
	h8 += carry7;
	h7 -= carry7 * ((uint32_t) 1L << 25);

	carry8 = h8 >> 26;
	h9 += carry8;
	h8 -= carry8 * ((uint32_t) 1L << 26);

	carry9 = h9 >> 25;
	h9 -= carry9 * ((uint32_t) 1L << 25);
	/* h10 = carry9 */

	/**
	 * Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
	 * Have h0+...+2^230 h9 between 0 and 2^255-1;
	 * evidently 2^255 h10-2^255 q = 0.
	 * Goal: Output h0+...+2^230 h9.
	 */
	/* pack the ten 26/25-bit limbs into 32 little-endian bytes; limbs
	 * meet inside a byte at the OR'd positions */
	s[0] = h0 >> 0;
	s[1] = h0 >> 8;
	s[2] = h0 >> 16;
	s[3] = (h0 >> 24) | (h1 * ((uint32_t) 1 << 2));
	s[4] = h1 >> 6;
	s[5] = h1 >> 14;
	s[6] = (h1 >> 22) | (h2 * ((uint32_t) 1 << 3));
	s[7] = h2 >> 5;
	s[8] = h2 >> 13;
	s[9] = (h2 >> 21) | (h3 * ((uint32_t) 1 << 5));
	s[10] = h3 >> 3;
	s[11] = h3 >> 11;
	s[12] = (h3 >> 19) | (h4 * ((uint32_t) 1 << 6));
	s[13] = h4 >> 2;
	s[14] = h4 >> 10;
	s[15] = h4 >> 18;
	s[16] = h5 >> 0;
	s[17] = h5 >> 8;
	s[18] = h5 >> 16;
	s[19] = (h5 >> 24) | (h6 * ((uint32_t) 1 << 1));
	s[20] = h6 >> 7;
	s[21] = h6 >> 15;
	s[22] = (h6 >> 23) | (h7 * ((uint32_t) 1 << 3));
	s[23] = h7 >> 5;
	s[24] = h7 >> 13;
	s[25] = (h7 >> 21) | (h8 * ((uint32_t) 1 << 4));
	s[26] = h8 >> 4;
	s[27] = h8 >> 12;
	s[28] = (h8 >> 20) | (h9 * ((uint32_t) 1 << 6));
	s[29] = h9 >> 2;
	s[30] = h9 >> 10;
	s[31] = h9 >> 18;
}
417
418 /**
419 * return 1 if f is in {1,3,5,...,q-2}
420 * return 0 if f is in {0,2,4,...,q-1}
421 *
422 * Preconditions:
423 * |f| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25, etc.
424 */
425 static int fe_isnegative(const fe f)
426 {
427 uint8_t s[32];
428
429 fe_tobytes(s,f);
430
431 return s[0] & 1;
432 }
433
434 /**
435 * return 1 if f != 0
436 * return 0 if f == 0
437 *
438 * Preconditions:
439 * |f| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25,etc.
440 */
441 static uint8_t zero[32];
442
443 static int fe_isnonzero(const fe f)
444 {
445 uint8_t s[32];
446
447 fe_tobytes(s, f);
448
449 return !memeq_const(s, zero, 32);
450 }
451
/**
 * h = f * g
 * Can overlap h with f or g.
 *
 * Preconditions:
 * |f| bounded by 1.65*2^26, 1.65*2^25, 1.65*2^26, 1.65*2^25, etc.
 * |g| bounded by 1.65*2^26, 1.65*2^25, 1.65*2^26, 1.65*2^25, etc.
 *
 * Postconditions:
 * |h| bounded by 1.01*2^25, 1.01*2^24, 1.01*2^25, 1.01*2^24, etc.
 */

/**
 * Notes on implementation strategy:
 *
 * Using schoolbook multiplication.
 * Karatsuba would save a little in some cost models.
 *
 * Most multiplications by 2 and 19 are 32-bit precomputations;
 * cheaper than 64-bit postcomputations.
 * (The *19 factors fold the reduction 2^255 == 19 (mod 2^255-19) into
 * the products whose limb weights exceed 2^255; the *2 factors account
 * for the mixed 26/25-bit limb radix.)
 *
 * There is one remaining multiplication by 19 in the carry chain;
 * one *19 precomputation can be merged into this,
 * but the resulting data flow is considerably less clean.
 *
 * There are 12 carries below.
 * 10 of them are 2-way parallelizable and vectorizable.
 * Can get away with 11 carries, but then data flow is much deeper.
 *
 * With tighter constraints on inputs can squeeze carries into int32.
 */

static void fe_mul(fe h, const fe f, const fe g)
{
	int32_t f0 = f[0];
	int32_t f1 = f[1];
	int32_t f2 = f[2];
	int32_t f3 = f[3];
	int32_t f4 = f[4];
	int32_t f5 = f[5];
	int32_t f6 = f[6];
	int32_t f7 = f[7];
	int32_t f8 = f[8];
	int32_t f9 = f[9];

	int32_t g0 = g[0];
	int32_t g1 = g[1];
	int32_t g2 = g[2];
	int32_t g3 = g[3];
	int32_t g4 = g[4];
	int32_t g5 = g[5];
	int32_t g6 = g[6];
	int32_t g7 = g[7];
	int32_t g8 = g[8];
	int32_t g9 = g[9];

	int32_t g1_19 = 19 * g1; /* 1.959375*2^29 */
	int32_t g2_19 = 19 * g2; /* 1.959375*2^30; still ok */
	int32_t g3_19 = 19 * g3;
	int32_t g4_19 = 19 * g4;
	int32_t g5_19 = 19 * g5;
	int32_t g6_19 = 19 * g6;
	int32_t g7_19 = 19 * g7;
	int32_t g8_19 = 19 * g8;
	int32_t g9_19 = 19 * g9;

	int32_t f1_2 = 2 * f1;
	int32_t f3_2 = 2 * f3;
	int32_t f5_2 = 2 * f5;
	int32_t f7_2 = 2 * f7;
	int32_t f9_2 = 2 * f9;

	int64_t f0g0 = f0 * (int64_t) g0;
	int64_t f0g1 = f0 * (int64_t) g1;
	int64_t f0g2 = f0 * (int64_t) g2;
	int64_t f0g3 = f0 * (int64_t) g3;
	int64_t f0g4 = f0 * (int64_t) g4;
	int64_t f0g5 = f0 * (int64_t) g5;
	int64_t f0g6 = f0 * (int64_t) g6;
	int64_t f0g7 = f0 * (int64_t) g7;
	int64_t f0g8 = f0 * (int64_t) g8;
	int64_t f0g9 = f0 * (int64_t) g9;

	int64_t f1g0 = f1 * (int64_t) g0;
	int64_t f1g1_2 = f1_2 * (int64_t) g1;
	int64_t f1g2 = f1 * (int64_t) g2;
	int64_t f1g3_2 = f1_2 * (int64_t) g3;
	int64_t f1g4 = f1 * (int64_t) g4;
	int64_t f1g5_2 = f1_2 * (int64_t) g5;
	int64_t f1g6 = f1 * (int64_t) g6;
	int64_t f1g7_2 = f1_2 * (int64_t) g7;
	int64_t f1g8 = f1 * (int64_t) g8;
	int64_t f1g9_38 = f1_2 * (int64_t) g9_19;

	int64_t f2g0 = f2 * (int64_t) g0;
	int64_t f2g1 = f2 * (int64_t) g1;
	int64_t f2g2 = f2 * (int64_t) g2;
	int64_t f2g3 = f2 * (int64_t) g3;
	int64_t f2g4 = f2 * (int64_t) g4;
	int64_t f2g5 = f2 * (int64_t) g5;
	int64_t f2g6 = f2 * (int64_t) g6;
	int64_t f2g7 = f2 * (int64_t) g7;
	int64_t f2g8_19 = f2 * (int64_t) g8_19;
	int64_t f2g9_19 = f2 * (int64_t) g9_19;

	int64_t f3g0 = f3 * (int64_t) g0;
	int64_t f3g1_2 = f3_2 * (int64_t) g1;
	int64_t f3g2 = f3 * (int64_t) g2;
	int64_t f3g3_2 = f3_2 * (int64_t) g3;
	int64_t f3g4 = f3 * (int64_t) g4;
	int64_t f3g5_2 = f3_2 * (int64_t) g5;
	int64_t f3g6 = f3 * (int64_t) g6;
	int64_t f3g7_38 = f3_2 * (int64_t) g7_19;
	int64_t f3g8_19 = f3 * (int64_t) g8_19;
	int64_t f3g9_38 = f3_2 * (int64_t) g9_19;

	int64_t f4g0 = f4 * (int64_t) g0;
	int64_t f4g1 = f4 * (int64_t) g1;
	int64_t f4g2 = f4 * (int64_t) g2;
	int64_t f4g3 = f4 * (int64_t) g3;
	int64_t f4g4 = f4 * (int64_t) g4;
	int64_t f4g5 = f4 * (int64_t) g5;
	int64_t f4g6_19 = f4 * (int64_t) g6_19;
	int64_t f4g7_19 = f4 * (int64_t) g7_19;
	int64_t f4g8_19 = f4 * (int64_t) g8_19;
	int64_t f4g9_19 = f4 * (int64_t) g9_19;

	int64_t f5g0 = f5 * (int64_t) g0;
	int64_t f5g1_2 = f5_2 * (int64_t) g1;
	int64_t f5g2 = f5 * (int64_t) g2;
	int64_t f5g3_2 = f5_2 * (int64_t) g3;
	int64_t f5g4 = f5 * (int64_t) g4;
	int64_t f5g5_38 = f5_2 * (int64_t) g5_19;
	int64_t f5g6_19 = f5 * (int64_t) g6_19;
	int64_t f5g7_38 = f5_2 * (int64_t) g7_19;
	int64_t f5g8_19 = f5 * (int64_t) g8_19;
	int64_t f5g9_38 = f5_2 * (int64_t) g9_19;

	int64_t f6g0 = f6 * (int64_t) g0;
	int64_t f6g1 = f6 * (int64_t) g1;
	int64_t f6g2 = f6 * (int64_t) g2;
	int64_t f6g3 = f6 * (int64_t) g3;
	int64_t f6g4_19 = f6 * (int64_t) g4_19;
	int64_t f6g5_19 = f6 * (int64_t) g5_19;
	int64_t f6g6_19 = f6 * (int64_t) g6_19;
	int64_t f6g7_19 = f6 * (int64_t) g7_19;
	int64_t f6g8_19 = f6 * (int64_t) g8_19;
	int64_t f6g9_19 = f6 * (int64_t) g9_19;

	int64_t f7g0 = f7 * (int64_t) g0;
	int64_t f7g1_2 = f7_2 * (int64_t) g1;
	int64_t f7g2 = f7 * (int64_t) g2;
	int64_t f7g3_38 = f7_2 * (int64_t) g3_19;
	int64_t f7g4_19 = f7 * (int64_t) g4_19;
	int64_t f7g5_38 = f7_2 * (int64_t) g5_19;
	int64_t f7g6_19 = f7 * (int64_t) g6_19;
	int64_t f7g7_38 = f7_2 * (int64_t) g7_19;
	int64_t f7g8_19 = f7 * (int64_t) g8_19;
	int64_t f7g9_38 = f7_2 * (int64_t) g9_19;

	int64_t f8g0 = f8 * (int64_t) g0;
	int64_t f8g1 = f8 * (int64_t) g1;
	int64_t f8g2_19 = f8 * (int64_t) g2_19;
	int64_t f8g3_19 = f8 * (int64_t) g3_19;
	int64_t f8g4_19 = f8 * (int64_t) g4_19;
	int64_t f8g5_19 = f8 * (int64_t) g5_19;
	int64_t f8g6_19 = f8 * (int64_t) g6_19;
	int64_t f8g7_19 = f8 * (int64_t) g7_19;
	int64_t f8g8_19 = f8 * (int64_t) g8_19;
	int64_t f8g9_19 = f8 * (int64_t) g9_19;

	int64_t f9g0 = f9 * (int64_t) g0;
	int64_t f9g1_38 = f9_2 * (int64_t) g1_19;
	int64_t f9g2_19 = f9 * (int64_t) g2_19;
	int64_t f9g3_38 = f9_2 * (int64_t) g3_19;
	int64_t f9g4_19 = f9 * (int64_t) g4_19;
	int64_t f9g5_38 = f9_2 * (int64_t) g5_19;
	int64_t f9g6_19 = f9 * (int64_t) g6_19;
	int64_t f9g7_38 = f9_2 * (int64_t) g7_19;
	int64_t f9g8_19 = f9 * (int64_t) g8_19;
	int64_t f9g9_38 = f9_2 * (int64_t) g9_19;

	/* each output limb h[k] collects all products figj with i+j == k,
	 * plus the reduced (i+j == k+10) products */
	int64_t h0 = f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 +
				 f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38;
	int64_t h1 = f0g1 + f1g0 + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 +
				 f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19;
	int64_t h2 = f0g2 + f1g1_2 + f2g0 + f3g9_38 + f4g8_19 + f5g7_38 +
				 f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38;
	int64_t h3 = f0g3 + f1g2 + f2g1 + f3g0 + f4g9_19 + f5g8_19 +
				 f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19;
	int64_t h4 = f0g4 + f1g3_2 + f2g2 + f3g1_2 + f4g0 + f5g9_38 +
				 f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38;
	int64_t h5 = f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 +
				 f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19;
	int64_t h6 = f0g6 + f1g5_2 + f2g4 + f3g3_2 + f4g2 + f5g1_2 +
				 f6g0 + f7g9_38 + f8g8_19 + f9g7_38;
	int64_t h7 = f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 +
				 f6g1 + f7g0 + f8g9_19 + f9g8_19;
	int64_t h8 = f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 +
				 f6g2 + f7g1_2 + f8g0 + f9g9_38;
	int64_t h9 = f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 +
				 f6g3 + f7g2 + f8g1 + f9g0 ;

	int64_t carry0, carry1, carry2, carry3, carry4;
	int64_t carry5, carry6, carry7, carry8, carry9;

	/**
	 * |h0| <= (1.65*1.65*2^52*(1+19+19+19+19)+1.65*1.65*2^50*(38+38+38+38+38))
	 * i.e. |h0| <= 1.4*2^60; narrower ranges for h2, h4, h6, h8
	 * |h1| <= (1.65*1.65*2^51*(1+1+19+19+19+19+19+19+19+19))
	 * i.e. |h1| <= 1.7*2^59; narrower ranges for h3, h5, h7, h9
	 */

	carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
	h1 += carry0;
	h0 -= carry0 * ((uint64_t) 1L << 26);
	/* |h0| <= 2^25 */
	/* |h1| <= 1.71*2^59 */

	carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
	h5 += carry4;
	h4 -= carry4 * ((uint64_t) 1L << 26);
	/* |h4| <= 2^25 */
	/* |h5| <= 1.71*2^59 */

	carry1 = (h1 + (int64_t) (1L << 24)) >> 25;
	h2 += carry1;
	h1 -= carry1 * ((uint64_t) 1L << 25);
	/* |h1| <= 2^24; from now on fits into int32 */
	/* |h2| <= 1.41*2^60 */

	carry5 = (h5 + (int64_t) (1L << 24)) >> 25;
	h6 += carry5;
	h5 -= carry5 * ((uint64_t) 1L << 25);
	/* |h5| <= 2^24; from now on fits into int32 */
	/* |h6| <= 1.41*2^60 */

	carry2 = (h2 + (int64_t) (1L << 25)) >> 26;
	h3 += carry2;
	h2 -= carry2 * ((uint64_t) 1L << 26);
	/* |h2| <= 2^25; from now on fits into int32 unchanged */
	/* |h3| <= 1.71*2^59 */

	carry6 = (h6 + (int64_t) (1L << 25)) >> 26;
	h7 += carry6;
	h6 -= carry6 * ((uint64_t) 1L << 26);
	/* |h6| <= 2^25; from now on fits into int32 unchanged */
	/* |h7| <= 1.71*2^59 */

	carry3 = (h3 + (int64_t) (1L << 24)) >> 25;
	h4 += carry3;
	h3 -= carry3 * ((uint64_t) 1L << 25);
	/* |h3| <= 2^24; from now on fits into int32 unchanged */
	/* |h4| <= 1.72*2^34 */

	carry7 = (h7 + (int64_t) (1L << 24)) >> 25;
	h8 += carry7;
	h7 -= carry7 * ((uint64_t) 1L << 25);
	/* |h7| <= 2^24; from now on fits into int32 unchanged */
	/* |h8| <= 1.41*2^60 */

	carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
	h5 += carry4;
	h4 -= carry4 * ((uint64_t) 1L << 26);
	/* |h4| <= 2^25; from now on fits into int32 unchanged */
	/* |h5| <= 1.01*2^24 */

	carry8 = (h8 + (int64_t) (1L << 25)) >> 26;
	h9 += carry8;
	h8 -= carry8 * ((uint64_t) 1L << 26);
	/* |h8| <= 2^25; from now on fits into int32 unchanged */
	/* |h9| <= 1.71*2^59 */

	/* the carry out of the top limb wraps around to h0 weighted by 19
	 * (2^255 == 19 mod 2^255-19) */
	carry9 = (h9 + (int64_t) (1L << 24)) >> 25;
	h0 += carry9 * 19;
	h9 -= carry9 * ((uint64_t) 1L << 25);
	/* |h9| <= 2^24; from now on fits into int32 unchanged */
	/* |h0| <= 1.1*2^39 */

	carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
	h1 += carry0;
	h0 -= carry0 * ((uint64_t) 1L << 26);
	/* |h0| <= 2^25; from now on fits into int32 unchanged */
	/* |h1| <= 1.01*2^24 */

	h[0] = (int32_t) h0;
	h[1] = (int32_t) h1;
	h[2] = (int32_t) h2;
	h[3] = (int32_t) h3;
	h[4] = (int32_t) h4;
	h[5] = (int32_t) h5;
	h[6] = (int32_t) h6;
	h[7] = (int32_t) h7;
	h[8] = (int32_t) h8;
	h[9] = (int32_t) h9;
}
748
749 /**
750 * h = -f
751 *
752 * Preconditions:
753 * |f| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
754 *
755 * Postconditions:
756 * |h| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
757 */
758 static void fe_neg(fe h,const fe f)
759 {
760 int32_t f0 = f[0];
761 int32_t f1 = f[1];
762 int32_t f2 = f[2];
763 int32_t f3 = f[3];
764 int32_t f4 = f[4];
765 int32_t f5 = f[5];
766 int32_t f6 = f[6];
767 int32_t f7 = f[7];
768 int32_t f8 = f[8];
769 int32_t f9 = f[9];
770
771 int32_t h0 = -f0;
772 int32_t h1 = -f1;
773 int32_t h2 = -f2;
774 int32_t h3 = -f3;
775 int32_t h4 = -f4;
776 int32_t h5 = -f5;
777 int32_t h6 = -f6;
778 int32_t h7 = -f7;
779 int32_t h8 = -f8;
780 int32_t h9 = -f9;
781
782 h[0] = h0;
783 h[1] = h1;
784 h[2] = h2;
785 h[3] = h3;
786 h[4] = h4;
787 h[5] = h5;
788 h[6] = h6;
789 h[7] = h7;
790 h[8] = h8;
791 h[9] = h9;
792 }
793
/**
 * h = f * f
 * Can overlap h with f.
 *
 * Preconditions:
 * |f| bounded by 1.65*2^26, 1.65*2^25, 1.65*2^26, 1.65*2^25, etc.
 *
 * Postconditions:
 * |h| bounded by 1.01*2^25, 1.01*2^24, 1.01*2^25, 1.01*2^24, etc.
 *
 * See fe_mul above for discussion of implementation strategy; squaring
 * saves the symmetric cross products (fi*fj == fj*fi), so doubled
 * precomputations replace the duplicated terms.
 */
static void fe_sq(fe h, const fe f)
{
	int32_t f0 = f[0];
	int32_t f1 = f[1];
	int32_t f2 = f[2];
	int32_t f3 = f[3];
	int32_t f4 = f[4];
	int32_t f5 = f[5];
	int32_t f6 = f[6];
	int32_t f7 = f[7];
	int32_t f8 = f[8];
	int32_t f9 = f[9];

	int32_t f0_2 = 2 * f0;
	int32_t f1_2 = 2 * f1;
	int32_t f2_2 = 2 * f2;
	int32_t f3_2 = 2 * f3;
	int32_t f4_2 = 2 * f4;
	int32_t f5_2 = 2 * f5;
	int32_t f6_2 = 2 * f6;
	int32_t f7_2 = 2 * f7;

	int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
	int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
	int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
	int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
	int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */

	int64_t f0f0 = f0 * (int64_t) f0;
	int64_t f0f1_2 = f0_2 * (int64_t) f1;
	int64_t f0f2_2 = f0_2 * (int64_t) f2;
	int64_t f0f3_2 = f0_2 * (int64_t) f3;
	int64_t f0f4_2 = f0_2 * (int64_t) f4;
	int64_t f0f5_2 = f0_2 * (int64_t) f5;
	int64_t f0f6_2 = f0_2 * (int64_t) f6;
	int64_t f0f7_2 = f0_2 * (int64_t) f7;
	int64_t f0f8_2 = f0_2 * (int64_t) f8;
	int64_t f0f9_2 = f0_2 * (int64_t) f9;

	int64_t f1f1_2 = f1_2 * (int64_t) f1;
	int64_t f1f2_2 = f1_2 * (int64_t) f2;
	int64_t f1f3_4 = f1_2 * (int64_t) f3_2;
	int64_t f1f4_2 = f1_2 * (int64_t) f4;
	int64_t f1f5_4 = f1_2 * (int64_t) f5_2;
	int64_t f1f6_2 = f1_2 * (int64_t) f6;
	int64_t f1f7_4 = f1_2 * (int64_t) f7_2;
	int64_t f1f8_2 = f1_2 * (int64_t) f8;
	int64_t f1f9_76 = f1_2 * (int64_t) f9_38;

	int64_t f2f2 = f2 * (int64_t) f2;
	int64_t f2f3_2 = f2_2 * (int64_t) f3;
	int64_t f2f4_2 = f2_2 * (int64_t) f4;
	int64_t f2f5_2 = f2_2 * (int64_t) f5;
	int64_t f2f6_2 = f2_2 * (int64_t) f6;
	int64_t f2f7_2 = f2_2 * (int64_t) f7;
	int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
	int64_t f2f9_38 = f2 * (int64_t) f9_38;

	int64_t f3f3_2 = f3_2 * (int64_t) f3;
	int64_t f3f4_2 = f3_2 * (int64_t) f4;
	int64_t f3f5_4 = f3_2 * (int64_t) f5_2;
	int64_t f3f6_2 = f3_2 * (int64_t) f6;
	int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
	int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
	int64_t f3f9_76 = f3_2 * (int64_t) f9_38;

	int64_t f4f4 = f4 * (int64_t) f4;
	int64_t f4f5_2 = f4_2 * (int64_t) f5;
	int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
	int64_t f4f7_38 = f4 * (int64_t) f7_38;
	int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
	int64_t f4f9_38 = f4 * (int64_t) f9_38;

	int64_t f5f5_38 = f5 * (int64_t) f5_38;
	int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
	int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
	int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
	int64_t f5f9_76 = f5_2 * (int64_t) f9_38;

	int64_t f6f6_19 = f6 * (int64_t) f6_19;
	int64_t f6f7_38 = f6 * (int64_t) f7_38;
	int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
	int64_t f6f9_38 = f6 * (int64_t) f9_38;

	int64_t f7f7_38 = f7 * (int64_t) f7_38;
	int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
	int64_t f7f9_76 = f7_2 * (int64_t) f9_38;

	int64_t f8f8_19 = f8 * (int64_t) f8_19;
	int64_t f8f9_38 = f8 * (int64_t) f9_38;

	int64_t f9f9_38 = f9 * (int64_t) f9_38;

	int64_t h0 = f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
	int64_t h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
	int64_t h2 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
	int64_t h3 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38;
	int64_t h4 = f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38;
	int64_t h5 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38;
	int64_t h6 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19;
	int64_t h7 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38;
	int64_t h8 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38;
	int64_t h9 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2;

	int64_t carry0, carry1, carry2, carry3, carry4;
	int64_t carry5, carry6, carry7, carry8, carry9;

	/* same carry schedule as fe_mul */
	carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
	h1 += carry0;
	h0 -= carry0 * ((uint64_t) 1L << 26);

	carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
	h5 += carry4;
	h4 -= carry4 * ((uint64_t) 1L << 26);

	carry1 = (h1 + (int64_t) (1L << 24)) >> 25;
	h2 += carry1;
	h1 -= carry1 * ((uint64_t) 1L << 25);

	carry5 = (h5 + (int64_t) (1L << 24)) >> 25;
	h6 += carry5;
	h5 -= carry5 * ((uint64_t) 1L << 25);

	carry2 = (h2 + (int64_t) (1L << 25)) >> 26;
	h3 += carry2;
	h2 -= carry2 * ((uint64_t) 1L << 26);

	carry6 = (h6 + (int64_t) (1L << 25)) >> 26;
	h7 += carry6;
	h6 -= carry6 * ((uint64_t) 1L << 26);

	carry3 = (h3 + (int64_t) (1L << 24)) >> 25;
	h4 += carry3;
	h3 -= carry3 * ((uint64_t) 1L << 25);

	carry7 = (h7 + (int64_t) (1L << 24)) >> 25;
	h8 += carry7;
	h7 -= carry7 * ((uint64_t) 1L << 25);

	carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
	h5 += carry4;
	h4 -= carry4 * ((uint64_t) 1L << 26);

	carry8 = (h8 + (int64_t) (1L << 25)) >> 26;
	h9 += carry8;
	h8 -= carry8 * ((uint64_t) 1L << 26);

	carry9 = (h9 + (int64_t) (1L << 24)) >> 25;
	h0 += carry9 * 19;
	h9 -= carry9 * ((uint64_t) 1L << 25);

	carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
	h1 += carry0;
	h0 -= carry0 * ((uint64_t) 1L << 26);

	h[0] = (int32_t) h0;
	h[1] = (int32_t) h1;
	h[2] = (int32_t) h2;
	h[3] = (int32_t) h3;
	h[4] = (int32_t) h4;
	h[5] = (int32_t) h5;
	h[6] = (int32_t) h6;
	h[7] = (int32_t) h7;
	h[8] = (int32_t) h8;
	h[9] = (int32_t) h9;
}
972
/**
 * h = 2 * f * f
 * Can overlap h with f.
 *
 * Preconditions:
 * |f| bounded by 1.65*2^26, 1.65*2^25, 1.65*2^26, 1.65*2^25, etc.
 *
 * Postconditions:
 * |h| bounded by 1.01*2^25, 1.01*2^24, 1.01*2^25, 1.01*2^24, etc.
 *
 * Identical to fe_sq except that every limb is doubled before the
 * final carry chain; see fe_mul above for discussion of implementation
 * strategy.
 */
static void fe_sq2(fe h, const fe f)
{
	int32_t f0 = f[0];
	int32_t f1 = f[1];
	int32_t f2 = f[2];
	int32_t f3 = f[3];
	int32_t f4 = f[4];
	int32_t f5 = f[5];
	int32_t f6 = f[6];
	int32_t f7 = f[7];
	int32_t f8 = f[8];
	int32_t f9 = f[9];

	int32_t f0_2 = 2 * f0;
	int32_t f1_2 = 2 * f1;
	int32_t f2_2 = 2 * f2;
	int32_t f3_2 = 2 * f3;
	int32_t f4_2 = 2 * f4;
	int32_t f5_2 = 2 * f5;
	int32_t f6_2 = 2 * f6;
	int32_t f7_2 = 2 * f7;

	int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
	int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
	int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
	int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
	int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */

	int64_t f0f0 = f0 * (int64_t) f0;
	int64_t f0f1_2 = f0_2 * (int64_t) f1;
	int64_t f0f2_2 = f0_2 * (int64_t) f2;
	int64_t f0f3_2 = f0_2 * (int64_t) f3;
	int64_t f0f4_2 = f0_2 * (int64_t) f4;
	int64_t f0f5_2 = f0_2 * (int64_t) f5;
	int64_t f0f6_2 = f0_2 * (int64_t) f6;
	int64_t f0f7_2 = f0_2 * (int64_t) f7;
	int64_t f0f8_2 = f0_2 * (int64_t) f8;
	int64_t f0f9_2 = f0_2 * (int64_t) f9;

	int64_t f1f1_2 = f1_2 * (int64_t) f1;
	int64_t f1f2_2 = f1_2 * (int64_t) f2;
	int64_t f1f3_4 = f1_2 * (int64_t) f3_2;
	int64_t f1f4_2 = f1_2 * (int64_t) f4;
	int64_t f1f5_4 = f1_2 * (int64_t) f5_2;
	int64_t f1f6_2 = f1_2 * (int64_t) f6;
	int64_t f1f7_4 = f1_2 * (int64_t) f7_2;
	int64_t f1f8_2 = f1_2 * (int64_t) f8;
	int64_t f1f9_76 = f1_2 * (int64_t) f9_38;

	int64_t f2f2 = f2 * (int64_t) f2;
	int64_t f2f3_2 = f2_2 * (int64_t) f3;
	int64_t f2f4_2 = f2_2 * (int64_t) f4;
	int64_t f2f5_2 = f2_2 * (int64_t) f5;
	int64_t f2f6_2 = f2_2 * (int64_t) f6;
	int64_t f2f7_2 = f2_2 * (int64_t) f7;
	int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
	int64_t f2f9_38 = f2 * (int64_t) f9_38;

	int64_t f3f3_2 = f3_2 * (int64_t) f3;
	int64_t f3f4_2 = f3_2 * (int64_t) f4;
	int64_t f3f5_4 = f3_2 * (int64_t) f5_2;
	int64_t f3f6_2 = f3_2 * (int64_t) f6;
	int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
	int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
	int64_t f3f9_76 = f3_2 * (int64_t) f9_38;

	int64_t f4f4 = f4 * (int64_t) f4;
	int64_t f4f5_2 = f4_2 * (int64_t) f5;
	int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
	int64_t f4f7_38 = f4 * (int64_t) f7_38;
	int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
	int64_t f4f9_38 = f4 * (int64_t) f9_38;

	int64_t f5f5_38 = f5 * (int64_t) f5_38;
	int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
	int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
	int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
	int64_t f5f9_76 = f5_2 * (int64_t) f9_38;

	int64_t f6f6_19 = f6 * (int64_t) f6_19;
	int64_t f6f7_38 = f6 * (int64_t) f7_38;
	int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
	int64_t f6f9_38 = f6 * (int64_t) f9_38;

	int64_t f7f7_38 = f7 * (int64_t) f7_38;
	int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
	int64_t f7f9_76 = f7_2 * (int64_t) f9_38;

	int64_t f8f8_19 = f8 * (int64_t) f8_19;
	int64_t f8f9_38 = f8 * (int64_t) f9_38;

	int64_t f9f9_38 = f9 * (int64_t) f9_38;

	int64_t h0 = f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
	int64_t h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
	int64_t h2 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
	int64_t h3 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38;
	int64_t h4 = f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38;
	int64_t h5 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38;
	int64_t h6 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19;
	int64_t h7 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38;
	int64_t h8 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38;
	int64_t h9 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2;

	int64_t carry0, carry1, carry2, carry3, carry4;
	int64_t carry5, carry6, carry7, carry8, carry9;

	/* double before carrying: h = 2*f*f */
	h0 += h0;
	h1 += h1;
	h2 += h2;
	h3 += h3;
	h4 += h4;
	h5 += h5;
	h6 += h6;
	h7 += h7;
	h8 += h8;
	h9 += h9;

	/* same carry schedule as fe_mul */
	carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
	h1 += carry0;
	h0 -= carry0 * ((uint64_t) 1L << 26);

	carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
	h5 += carry4;
	h4 -= carry4 * ((uint64_t) 1L << 26);

	carry1 = (h1 + (int64_t) (1L << 24)) >> 25;
	h2 += carry1;
	h1 -= carry1 * ((uint64_t) 1L << 25);

	carry5 = (h5 + (int64_t) (1L << 24)) >> 25;
	h6 += carry5;
	h5 -= carry5 * ((uint64_t) 1L << 25);

	carry2 = (h2 + (int64_t) (1L << 25)) >> 26;
	h3 += carry2;
	h2 -= carry2 * ((uint64_t) 1L << 26);

	carry6 = (h6 + (int64_t) (1L << 25)) >> 26;
	h7 += carry6;
	h6 -= carry6 * ((uint64_t) 1L << 26);

	carry3 = (h3 + (int64_t) (1L << 24)) >> 25;
	h4 += carry3;
	h3 -= carry3 * ((uint64_t) 1L << 25);

	carry7 = (h7 + (int64_t) (1L << 24)) >> 25;
	h8 += carry7;
	h7 -= carry7 * ((uint64_t) 1L << 25);

	carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
	h5 += carry4;
	h4 -= carry4 * ((uint64_t) 1L << 26);

	carry8 = (h8 + (int64_t) (1L << 25)) >> 26;
	h9 += carry8;
	h8 -= carry8 * ((uint64_t) 1L << 26);

	carry9 = (h9 + (int64_t) (1L << 24)) >> 25;
	h0 += carry9 * 19;
	h9 -= carry9 * ((uint64_t) 1L << 25);

	carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
	h1 += carry0;
	h0 -= carry0 * ((uint64_t) 1L << 26);

	h[0] = (int32_t) h0;
	h[1] = (int32_t) h1;
	h[2] = (int32_t) h2;
	h[3] = (int32_t) h3;
	h[4] = (int32_t) h4;
	h[5] = (int32_t) h5;
	h[6] = (int32_t) h6;
	h[7] = (int32_t) h7;
	h[8] = (int32_t) h8;
	h[9] = (int32_t) h9;
}
1162
/**
 * out = z^-1 (mod p = 2^255-19), computed as z^(p-2) = z^(2^255-21)
 * using a fixed addition chain of squarings and multiplications, so the
 * running time does not depend on the value of z.
 */
static void fe_invert(fe out, const fe z)
{
	fe t0, t1, t2, t3;
	int i;

	fe_sq(t0, z);           /* t0 = z^2 */
	fe_sq(t1, t0);
	fe_sq(t1, t1);          /* t1 = z^8 */
	fe_mul(t1, z, t1);      /* t1 = z^9 */
	fe_mul(t0, t0, t1);     /* t0 = z^11 */
	fe_sq(t2, t0);          /* t2 = z^22 */
	fe_mul(t1, t1, t2);     /* t1 = z^31 = z^(2^5-1) */
	fe_sq(t2, t1);

	for (i = 1; i < 5; ++i)     /* t2 = z^(2^10-2^5) */
	{
		fe_sq(t2, t2);
	}

	fe_mul(t1, t2, t1);     /* t1 = z^(2^10-1) */
	fe_sq(t2, t1);

	for (i = 1; i < 10; ++i)    /* t2 = z^(2^20-2^10) */
	{
		fe_sq(t2, t2);
	}

	fe_mul(t2, t2, t1);     /* t2 = z^(2^20-1) */
	fe_sq(t3, t2);

	for (i = 1; i < 20; ++i)    /* t3 = z^(2^40-2^20) */
	{
		fe_sq(t3, t3);
	}

	fe_mul(t2, t3, t2);     /* t2 = z^(2^40-1) */
	fe_sq(t2, t2);

	for (i = 1; i < 10; ++i)    /* t2 = z^(2^50-2^10) */
	{
		fe_sq(t2, t2);
	}

	fe_mul(t1, t2, t1);     /* t1 = z^(2^50-1) */
	fe_sq(t2, t1);

	for (i = 1; i < 50; ++i)    /* t2 = z^(2^100-2^50) */
	{
		fe_sq(t2, t2);
	}

	fe_mul(t2, t2, t1);     /* t2 = z^(2^100-1) */
	fe_sq(t3, t2);

	for (i = 1; i < 100; ++i)   /* t3 = z^(2^200-2^100) */
	{
		fe_sq(t3, t3);
	}

	fe_mul(t2, t3, t2);     /* t2 = z^(2^200-1) */
	fe_sq(t2, t2);

	for (i = 1; i < 50; ++i)    /* t2 = z^(2^250-2^50) */
	{
		fe_sq(t2, t2);
	}

	fe_mul(t1, t2, t1);     /* t1 = z^(2^250-1) */
	fe_sq(t1, t1);

	for (i = 1; i < 5; ++i)     /* t1 = z^(2^255-2^5) */
	{
		fe_sq(t1, t1);
	}

	fe_mul(out, t1, t0);    /* out = z^(2^255-32+11) = z^(2^255-21) */
}
1240
/**
 * out = z^((p-5)/8) = z^(2^252-3) (mod p = 2^255-19), via a fixed
 * addition chain.  Used to compute square root candidates during point
 * decompression (ge_frombytes_negate_vartime).
 */
static void fe_pow22523(fe out, const fe z)
{
	fe t0, t1, t2;
	int i;

	fe_sq(t0, z);           /* t0 = z^2 */
	fe_sq(t1, t0);
	fe_sq(t1, t1);          /* t1 = z^8 */
	fe_mul(t1, z, t1);      /* t1 = z^9 */
	fe_mul(t0, t0, t1);     /* t0 = z^11 */
	fe_sq(t0, t0);          /* t0 = z^22 */
	fe_mul(t0, t1, t0);     /* t0 = z^31 = z^(2^5-1) */
	fe_sq(t1, t0);

	for (i = 1; i < 5; ++i)     /* t1 = z^(2^10-2^5) */
	{
		fe_sq(t1, t1);
	}

	fe_mul(t0, t1, t0);     /* t0 = z^(2^10-1) */
	fe_sq(t1, t0);

	for (i = 1; i < 10; ++i)    /* t1 = z^(2^20-2^10) */
	{
		fe_sq(t1, t1);
	}

	fe_mul(t1, t1, t0);     /* t1 = z^(2^20-1) */
	fe_sq(t2, t1);

	for (i = 1; i < 20; ++i)    /* t2 = z^(2^40-2^20) */
	{
		fe_sq(t2, t2);
	}

	fe_mul(t1, t2, t1);     /* t1 = z^(2^40-1) */
	fe_sq(t1, t1);

	for (i = 1; i < 10; ++i)    /* t1 = z^(2^50-2^10) */
	{
		fe_sq(t1, t1);
	}

	fe_mul(t0, t1, t0);     /* t0 = z^(2^50-1) */
	fe_sq(t1, t0);

	for (i = 1; i < 50; ++i)    /* t1 = z^(2^100-2^50) */
	{
		fe_sq(t1, t1);
	}

	fe_mul(t1, t1, t0);     /* t1 = z^(2^100-1) */
	fe_sq(t2, t1);
	for (i = 1; i < 100; ++i)   /* t2 = z^(2^200-2^100) */
	{
		fe_sq(t2, t2);
	}

	fe_mul(t1, t2, t1);     /* t1 = z^(2^200-1) */
	fe_sq(t1, t1);

	for (i = 1; i < 50; ++i)    /* t1 = z^(2^250-2^50) */
	{
		fe_sq(t1, t1);
	}

	fe_mul(t0, t1, t0);     /* t0 = z^(2^250-1) */
	fe_sq(t0, t0);
	fe_sq(t0, t0);          /* t0 = z^(2^252-4) */
	fe_mul(out, t0, z);     /* out = z^(2^252-3) */
}
1312
1313 /**
1314 * h = f - g
1315 * Can overlap h with f or g.
1316 *
1317 * Preconditions:
1318 * |f| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
1319 * |g| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
1320 *
1321 * Postconditions:
1322 * |h| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25, etc.
1323 */
1324 static void fe_sub(fe h, const fe f, const fe g)
1325 {
1326 int32_t f0 = f[0];
1327 int32_t f1 = f[1];
1328 int32_t f2 = f[2];
1329 int32_t f3 = f[3];
1330 int32_t f4 = f[4];
1331 int32_t f5 = f[5];
1332 int32_t f6 = f[6];
1333 int32_t f7 = f[7];
1334 int32_t f8 = f[8];
1335 int32_t f9 = f[9];
1336
1337 int32_t g0 = g[0];
1338 int32_t g1 = g[1];
1339 int32_t g2 = g[2];
1340 int32_t g3 = g[3];
1341 int32_t g4 = g[4];
1342 int32_t g5 = g[5];
1343 int32_t g6 = g[6];
1344 int32_t g7 = g[7];
1345 int32_t g8 = g[8];
1346 int32_t g9 = g[9];
1347
1348 int32_t h0 = f0 - g0;
1349 int32_t h1 = f1 - g1;
1350 int32_t h2 = f2 - g2;
1351 int32_t h3 = f3 - g3;
1352 int32_t h4 = f4 - g4;
1353 int32_t h5 = f5 - g5;
1354 int32_t h6 = f6 - g6;
1355 int32_t h7 = f7 - g7;
1356 int32_t h8 = f8 - g8;
1357 int32_t h9 = f9 - g9;
1358
1359 h[0] = h0;
1360 h[1] = h1;
1361 h[2] = h2;
1362 h[3] = h3;
1363 h[4] = h4;
1364 h[5] = h5;
1365 h[6] = h6;
1366 h[7] = h7;
1367 h[8] = h8;
1368 h[9] = h9;
1369 }
1370
/**
 * r = p + q, with q in cached form (Y+X, Y-X, Z, 2d*T).
 * Extended twisted Edwards addition; the result is in the intermediate
 * p1p1 representation.
 */
static void ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q)
{
	fe t0;

	fe_add(r->X, p->Y, p->X);       /* Y1+X1 */
	fe_sub(r->Y, p->Y, p->X);       /* Y1-X1 */
	fe_mul(r->Z, r->X, q->YplusX);  /* A = (Y1+X1)(Y2+X2) */
	fe_mul(r->Y, r->Y, q->YminusX); /* B = (Y1-X1)(Y2-X2) */
	fe_mul(r->T, q->T2d, p->T);     /* C = 2d*T1*T2 */
	fe_mul(r->X, p->Z, q->Z);       /* Z1*Z2 */
	fe_add(t0, r->X, r->X);         /* D = 2*Z1*Z2 */
	fe_sub(r->X, r->Z, r->Y);       /* X3 = A-B */
	fe_add(r->Y, r->Z, r->Y);       /* Y3 = A+B */
	fe_add(r->Z, t0, r->T);         /* Z3 = D+C */
	fe_sub(r->T, t0, r->T);         /* T3 = D-C */
}
1390
/**
 * Recode the 256-bit little-endian scalar a into 256 signed digits r[i],
 * each either zero or an odd value in [-15, 15], such that
 * sum(r[i] * 2^i) == a.  Used for the sliding-window scalar
 * multiplication in ge_double_scalarmult_vartime().  Variable time.
 */
static void slide(int8_t *r, const uint8_t *a)
{
	int pos, off, j;

	/* expand a into its individual bits */
	for (pos = 0; pos < 256; ++pos)
	{
		r[pos] = 1 & (a[pos >> 3] >> (pos & 7));
	}

	/* greedily merge up to 6 following bits into each nonzero digit */
	for (pos = 0; pos < 256; ++pos)
	{
		if (!r[pos])
		{
			continue;
		}
		for (off = 1; off <= 6 && pos + off < 256; ++off)
		{
			if (!r[pos + off])
			{
				continue;
			}
			if (r[pos] + (r[pos + off] << off) <= 15)
			{
				/* absorb the higher bit into this digit */
				r[pos] += r[pos + off] << off;
				r[pos + off] = 0;
			}
			else if (r[pos] - (r[pos + off] << off) >= -15)
			{
				/* subtract instead and propagate a carry upwards */
				r[pos] -= r[pos + off] << off;

				for (j = pos + off; j < 256; ++j)
				{
					if (!r[j])
					{
						r[j] = 1;
						break;
					}
					r[j] = 0;
				}
			}
			else
			{
				break;
			}
		}
	}
}
1435
/* Bi[i] = (2*i+1)*B: precomputed odd multiples of the base point B,
 * indexed by digit/2 in ge_double_scalarmult_vartime() */
static const ge_precomp Bi[8] = {
#include "base2.h"
};
1439
/* the edwards25519 curve constant d
 * = 37095705934669439343138083508754565189542113879843219016388785533085940283555 */
static const fe d = {
	-10913610, 13857413, -15372611, 6949391, 114729,
	-8787816, -6275908, -3247719, -18696448, -12055116
};
1445
/* sqrt(-1) mod p, used to fix up the square-root candidate during point
 * decompression in ge_frombytes_negate_vartime() */
static const fe sqrtm1 = {
	-32595792, -7943725, 9377950, 3500415, 12389472,
	-272473, -25146209, -2005654, 326686, 11406482
};
1451
/**
 * Decode the 32-byte compressed point s (y coordinate with the sign of x
 * in the top bit) into extended coordinates, with the x coordinate
 * negated (h = -P).  Variable time, so only use on public data.
 *
 * @return 0 on success, -1 if s is not the encoding of a curve point
 */
int ge_frombytes_negate_vartime(ge_p3 *h, const uint8_t *s)
{
	fe u, v, v3, vxx, check;

	fe_frombytes(h->Y,s);
	fe_1(h->Z);
	fe_sq(u,h->Y);
	fe_mul(v,u,d);
	fe_sub(u,u,h->Z);	/* u = y^2-1 */
	fe_add(v,v,h->Z);	/* v = dy^2+1 */

	/* recover x from x^2 = u/v via the candidate root uv^3(uv^7)^((q-5)/8) */
	fe_sq(v3,v);
	fe_mul(v3,v3,v);	/* v3 = v^3 */
	fe_sq(h->X,v3);
	fe_mul(h->X,h->X,v);
	fe_mul(h->X,h->X,u);	/* x = uv^7 */

	fe_pow22523(h->X,h->X);	/* x = (uv^7)^((q-5)/8) */
	fe_mul(h->X,h->X,v3);
	fe_mul(h->X,h->X,u);	/* x = uv^3(uv^7)^((q-5)/8) */

	/* verify v*x^2 == +/-u; if off by a factor sqrt(-1), correct it */
	fe_sq(vxx,h->X);
	fe_mul(vxx,vxx,v);
	fe_sub(check,vxx,u);	/* vx^2-u */

	if (fe_isnonzero(check))
	{
		fe_add(check,vxx,u);	/* vx^2+u */

		if (fe_isnonzero(check))
		{
			/* u/v is not a square: s is not a valid point encoding */
			return -1;
		}
		fe_mul(h->X,h->X,sqrtm1);
	}

	/* pick the root whose sign DIFFERS from the encoded sign bit, which
	 * negates the decoded point */
	if (fe_isnegative(h->X) == (s[31] >> 7))
	{
		fe_neg(h->X,h->X);
	}
	fe_mul(h->T,h->X,h->Y);

	return 0;
}
1496
/**
 * r = p + q, with q in precomputed affine form (y+x, y-x, 2d*x*y).
 * Mixed addition: cheaper than ge_add() because q has Z == 1.
 */
static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q)
{
	fe t0;

	fe_add(r->X, p->Y, p->X);       /* Y1+X1 */
	fe_sub(r->Y, p->Y, p->X);       /* Y1-X1 */
	fe_mul(r->Z, r->X, q->yplusx);  /* A = (Y1+X1)(y2+x2) */
	fe_mul(r->Y, r->Y, q->yminusx); /* B = (Y1-X1)(y2-x2) */
	fe_mul(r->T, q->xy2d, p->T);    /* C = 2d*x2*y2*T1 */
	fe_add(t0, p->Z, p->Z);         /* D = 2*Z1 */
	fe_sub(r->X, r->Z, r->Y);       /* X3 = A-B */
	fe_add(r->Y, r->Z, r->Y);       /* Y3 = A+B */
	fe_add(r->Z, t0, r->T);         /* Z3 = D+C */
	fe_sub(r->T, t0, r->T);         /* T3 = D-C */
}
1515
/**
 * r = p - q, with q in precomputed affine form (y+x, y-x, 2d*x*y).
 * Same as ge_madd() with the roles of y+x/y-x swapped and C negated.
 */
static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q)
{
	fe t0;

	fe_add(r->X, p->Y, p->X);       /* Y1+X1 */
	fe_sub(r->Y, p->Y, p->X);       /* Y1-X1 */
	fe_mul(r->Z, r->X, q->yminusx); /* A = (Y1+X1)(y2-x2) */
	fe_mul(r->Y, r->Y, q->yplusx);  /* B = (Y1-X1)(y2+x2) */
	fe_mul(r->T, q->xy2d, p->T);    /* C = 2d*x2*y2*T1 */
	fe_add(t0, p->Z, p->Z);         /* D = 2*Z1 */
	fe_sub(r->X, r->Z, r->Y);       /* X3 = A-B */
	fe_add(r->Y, r->Z, r->Y);       /* Y3 = A+B */
	fe_sub(r->Z, t0, r->T);         /* Z3 = D-C */
	fe_add(r->T, t0, r->T);         /* T3 = D+C */
}
1534
1535 /**
1536 * r = p
1537 */
1538 static void ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p)
1539 {
1540 fe_mul(r->X,p->X,p->T);
1541 fe_mul(r->Y,p->Y,p->Z);
1542 fe_mul(r->Z,p->Z,p->T);
1543 }
1544
1545 /**
1546 * r = p
1547 */
1548 static void ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p)
1549 {
1550 fe_mul(r->X,p->X,p->T);
1551 fe_mul(r->Y,p->Y,p->Z);
1552 fe_mul(r->Z,p->Z,p->T);
1553 fe_mul(r->T,p->X,p->Y);
1554 }
1555
1556 static void ge_p2_0(ge_p2 *h)
1557 {
1558 fe_0(h->X);
1559 fe_1(h->Y);
1560 fe_1(h->Z);
1561 }
1562
/**
 * r = 2 * p (projective doubling, result in p1p1 form)
 */
static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p)
{
	fe t0;

	fe_sq(r->X, p->X);          /* A = X1^2 */
	fe_sq(r->Z, p->Y);          /* B = Y1^2 */
	fe_sq2(r->T, p->Z);         /* C = 2*Z1^2 */
	fe_add(r->Y, p->X, p->Y);
	fe_sq(t0, r->Y);            /* t0 = (X1+Y1)^2 */
	fe_add(r->Y, r->Z, r->X);   /* Y3 = B+A */
	fe_sub(r->Z, r->Z, r->X);   /* Z3 = B-A */
	fe_sub(r->X, t0, r->Y);     /* X3 = (X1+Y1)^2-(A+B) = 2*X1*Y1 */
	fe_sub(r->T, r->T, r->Z);   /* T3 = C-(B-A) */
}
1580
1581 static void ge_p3_0(ge_p3 *h)
1582 {
1583 fe_0(h->X);
1584 fe_1(h->Y);
1585 fe_1(h->Z);
1586 fe_0(h->T);
1587 }
1588
/**
 * r = p, converted to the cached representation (see ge_p3_to_cached()
 * below; the doubled curve constant d2 it uses is defined first)
 */

/* 2 * d = 16295367250680780974490674513165176452449235426866156013048779062215315747161,
 * used by ge_p3_to_cached() to precompute 2d*T */
static const fe d2 = {
	-21827239, -5839606, -30745221, 13898782, 229458,
	15978800, -12551817, -6495438, 29715968, 9444199
};
1598
1599 static void ge_p3_to_cached(ge_cached *r, const ge_p3 *p)
1600 {
1601 fe_add(r->YplusX,p->Y,p->X);
1602 fe_sub(r->YminusX,p->Y,p->X);
1603 fe_copy(r->Z,p->Z);
1604 fe_mul(r->T2d,p->T,d2);
1605 }
1606
1607 /**
1608 * r = p
1609 */
1610 static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p)
1611 {
1612 fe_copy(r->X,p->X);
1613 fe_copy(r->Y,p->Y);
1614 fe_copy(r->Z,p->Z);
1615 }
1616
1617 void ge_p3_tobytes(uint8_t *s, const ge_p3 *h)
1618 {
1619 fe recip, x, y;
1620
1621 fe_invert(recip,h->Z);
1622 fe_mul(x,h->X,recip);
1623 fe_mul(y,h->Y,recip);
1624 fe_tobytes(s,y);
1625
1626 s[31] ^= fe_isnegative(x) << 7;
1627 }
1628
1629 /**
1630 * r = 2 * p
1631 */
1632 static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p)
1633 {
1634 ge_p2 q;
1635 ge_p3_to_p2(&q,p);
1636 ge_p2_dbl(r,&q);
1637 }
1638
1639 static void ge_precomp_0(ge_precomp *h)
1640 {
1641 fe_1(h->yplusx);
1642 fe_1(h->yminusx);
1643 fe_0(h->xy2d);
1644 }
1645
/**
 * Constant-time equality test: returns 1 if b == c, 0 otherwise,
 * without any data-dependent branch.
 */
static uint8_t equal(int8_t b, int8_t c)
{
	uint32_t diff = (uint8_t) (b ^ c);  /* 0 when equal, 1..255 otherwise */

	/* subtracting 1 wraps to all-ones exactly when diff == 0 */
	return (uint8_t) ((diff - 1) >> 31);
}
1658
/**
 * Constant-time sign test: returns 1 if b < 0, 0 otherwise.
 */
static uint8_t negative(int8_t b)
{
	/* sign-extend b to 64 bits, then extract the sign bit */
	uint64_t w = (uint64_t) (int64_t) b;

	return (uint8_t) (w >> 63);
}
1667
1668 static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b)
1669 {
1670 fe_cmov(t->yplusx,u->yplusx,b);
1671 fe_cmov(t->yminusx,u->yminusx,b);
1672 fe_cmov(t->xy2d,u->xy2d,b);
1673 }
1674
/**
 * base[i][j] = (j+1)*256^i*B: the precomputed table for the
 * constant-time fixed-base scalar multiplication in ge_scalarmult_base()
 */
static const ge_precomp base[32][8] = {
#include "base.h"
};
1681
/**
 * t = |b| * 256^pos * B, negated if b < 0, for a signed digit b in
 * [-8, 8].  Constant time: every table entry is read and the selection
 * is done with arithmetic masks (equal()/cmov()), so neither branches
 * nor the memory access pattern depend on the secret digit b.
 */
static void ge_select(ge_precomp *t, int pos, int8_t b)
{
	ge_precomp minust;
	uint8_t bnegative = negative(b);
	/* babs = |b| without branching: subtract 2*b when b is negative */
	uint8_t babs = b - (((-bnegative) & b) * ((int8_t) 1 << 1));

	/* start from the neutral element (covers b == 0) */
	ge_precomp_0(t);
	cmov(t,&base[pos][0],equal(babs,1));
	cmov(t,&base[pos][1],equal(babs,2));
	cmov(t,&base[pos][2],equal(babs,3));
	cmov(t,&base[pos][3],equal(babs,4));
	cmov(t,&base[pos][4],equal(babs,5));
	cmov(t,&base[pos][5],equal(babs,6));
	cmov(t,&base[pos][6],equal(babs,7));
	cmov(t,&base[pos][7],equal(babs,8));
	/* conditional negation: -(y+x, y-x, 2dxy) = (y-x, y+x, -2dxy) */
	fe_copy(minust.yplusx,t->yminusx);
	fe_copy(minust.yminusx,t->yplusx);
	fe_neg(minust.xy2d,t->xy2d);
	cmov(t,&minust,bnegative);
}
1702
/**
 * r = p - q, with q in cached form (Y+X, Y-X, Z, 2d*T).
 * Same as ge_add() with YplusX/YminusX swapped and C negated.
 */
static void ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q)
{
	fe t0;

	fe_add(r->X, p->Y, p->X);       /* Y1+X1 */
	fe_sub(r->Y, p->Y, p->X);       /* Y1-X1 */
	fe_mul(r->Z, r->X, q->YminusX); /* A = (Y1+X1)(Y2-X2) */
	fe_mul(r->Y, r->Y, q->YplusX);  /* B = (Y1-X1)(Y2+X2) */
	fe_mul(r->T, q->T2d, p->T);     /* C = 2d*T1*T2 */
	fe_mul(r->X, p->Z, q->Z);       /* Z1*Z2 */
	fe_add(t0, r->X, r->X);         /* D = 2*Z1*Z2 */
	fe_sub(r->X, r->Z, r->Y);       /* X3 = A-B */
	fe_add(r->Y, r->Z, r->Y);       /* Y3 = A+B */
	fe_sub(r->Z, t0, r->T);         /* Z3 = D-C */
	fe_add(r->T, t0, r->T);         /* T3 = D+C */
}
1722
1723 void ge_tobytes(uint8_t *s, const ge_p2 *h)
1724 {
1725 fe recip, x, y;
1726
1727 fe_invert(recip,h->Z);
1728 fe_mul(x,h->X,recip);
1729 fe_mul(y,h->Y,recip);
1730 fe_tobytes(s,y);
1731
1732 s[31] ^= fe_isnegative(x) << 7;
1733 }
1734
/**
 * h = a * B (implemented by ge_scalarmult_base() further below)
 * where a = a[0]+256*a[1]+...+256^31 a[31]
 * B is the Ed25519 base point (x,4/5) with x positive.
 *
 * Preconditions:
 *   a[31] <= 127
 */
1743
1744 /**
1745 * r = a * A + b * B
1746 * where a = a[0]+256*a[1]+...+256^31 a[31].
1747 * and b = b[0]+256*b[1]+...+256^31 b[31].
1748 * B is the Ed25519 base point (x,4/5) with x positive.
1749 */
1750 void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a, const ge_p3 *A,
1751 const uint8_t *b)
1752 {
1753 int8_t aslide[256];
1754 int8_t bslide[256];
1755 ge_cached Ai[8]; /* A,3A,5A,7A,9A,11A,13A,15A */
1756 ge_p1p1 t;
1757 ge_p3 u, A2;
1758 int i;
1759
1760 slide(aslide,a);
1761 slide(bslide,b);
1762
1763 ge_p3_to_cached(&Ai[0],A);
1764 ge_p3_dbl(&t,A);
1765 ge_p1p1_to_p3(&A2,&t);
1766
1767 ge_add(&t,&A2,&Ai[0]);
1768 ge_p1p1_to_p3(&u,&t);
1769 ge_p3_to_cached(&Ai[1],&u);
1770
1771 ge_add(&t,&A2,&Ai[1]);
1772 ge_p1p1_to_p3(&u,&t);
1773 ge_p3_to_cached(&Ai[2],&u);
1774
1775 ge_add(&t,&A2,&Ai[2]);
1776 ge_p1p1_to_p3(&u,&t);
1777 ge_p3_to_cached(&Ai[3],&u);
1778
1779 ge_add(&t,&A2,&Ai[3]);
1780 ge_p1p1_to_p3(&u,&t);
1781 ge_p3_to_cached(&Ai[4],&u);
1782
1783 ge_add(&t,&A2,&Ai[4]);
1784 ge_p1p1_to_p3(&u,&t);
1785 ge_p3_to_cached(&Ai[5],&u);
1786
1787 ge_add(&t,&A2,&Ai[5]);
1788 ge_p1p1_to_p3(&u,&t);
1789 ge_p3_to_cached(&Ai[6],&u);
1790
1791 ge_add(&t,&A2,&Ai[6]);
1792 ge_p1p1_to_p3(&u,&t);
1793 ge_p3_to_cached(&Ai[7],&u);
1794
1795 ge_p2_0(r);
1796
1797 for (i = 255; i >= 0; --i)
1798 {
1799 if (aslide[i] || bslide[i])
1800 {
1801 break;
1802 }
1803 }
1804
1805 for (; i >= 0 ;--i)
1806 {
1807 ge_p2_dbl(&t,r);
1808
1809 if (aslide[i] > 0)
1810 {
1811 ge_p1p1_to_p3(&u,&t);
1812 ge_add(&t,&u,&Ai[aslide[i]/2]);
1813 }
1814 else if (aslide[i] < 0)
1815 {
1816 ge_p1p1_to_p3(&u,&t);
1817 ge_sub(&t,&u,&Ai[(-aslide[i])/2]);
1818 }
1819
1820 if (bslide[i] > 0)
1821 {
1822 ge_p1p1_to_p3(&u,&t);
1823 ge_madd(&t,&u,&Bi[bslide[i]/2]);
1824 }
1825 else if (bslide[i] < 0)
1826 {
1827 ge_p1p1_to_p3(&u,&t);
1828 ge_msub(&t,&u,&Bi[(-bslide[i])/2]);
1829 }
1830 ge_p1p1_to_p2(r,&t);
1831 }
1832 }
1833
/**
 * h = a * B, where a = a[0]+256*a[1]+...+256^31 a[31] and B is the
 * Ed25519 base point.  Constant time with respect to the scalar a
 * (fixed digit recoding plus ge_select() table lookups).
 *
 * Precondition: a[31] <= 127.
 */
void ge_scalarmult_base(ge_p3 *h, const uint8_t *a)
{
	int8_t e[64];
	int8_t carry = 0;
	ge_p1p1 r;
	ge_p2 s;
	ge_precomp t;
	int i;

	/* split the scalar into 64 radix-16 digits */
	for (i = 0; i < 32; ++i)
	{
		e[2 * i + 0] = (a[i] >> 0) & 15;
		e[2 * i + 1] = (a[i] >> 4) & 15;
	}
	/* each e[i] is between 0 and 15 */
	/* e[63] is between 0 and 7 */

	/* recode to signed digits so that negated table entries can be used;
	 * carries propagate into the next digit */
	for (i = 0; i < 63; ++i) {
		e[i] += carry;
		carry = e[i] + 8;
		carry >>= 4;
		e[i] -= carry * ((int8_t) 1 << 4);
	}
	e[63] += carry;
	/* each e[i] is between -8 and 8 */

	/* first pass: accumulate the odd digits, h = sum e[2j+1]*256^j*B */
	ge_p3_0(h);
	for (i = 1; i < 64; i += 2)
	{
		ge_select(&t,i / 2,e[i]);
		ge_madd(&r,h,&t); ge_p1p1_to_p3(h,&r);
	}

	/* h *= 16 to shift the odd-digit sum into place */
	ge_p3_dbl(&r,h); ge_p1p1_to_p2(&s,&r);
	ge_p2_dbl(&r,&s); ge_p1p1_to_p2(&s,&r);
	ge_p2_dbl(&r,&s); ge_p1p1_to_p2(&s,&r);
	ge_p2_dbl(&r,&s); ge_p1p1_to_p3(h,&r);

	/* second pass: add the even digits, h += sum e[2j]*256^j*B */
	for (i = 0; i < 64; i += 2)
	{
		ge_select(&t,i / 2,e[i]);
		ge_madd(&r,h,&t);
		ge_p1p1_to_p3(h,&r);
	}
}
1879
1880 /**
1881 * Input:
1882 * a[0]+256*a[1]+...+256^31*a[31] = a
1883 * b[0]+256*b[1]+...+256^31*b[31] = b
1884 * c[0]+256*c[1]+...+256^31*c[31] = c
1885 *
1886 * Output:
1887 * s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l
1888 * where l = 2^252 + 27742317777372353535851937790883648493.
1889 */
void sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b, const uint8_t *c)
{
	/* Unpack each 32-byte scalar into 12 limbs of (at most) 21 bits
	 * each (radix 2^21); 2097151 = 2^21 - 1. */
	int64_t a0 = 2097151 & load_3(a);
	int64_t a1 = 2097151 & (load_4(a + 2) >> 5);
	int64_t a2 = 2097151 & (load_3(a + 5) >> 2);
	int64_t a3 = 2097151 & (load_4(a + 7) >> 7);
	int64_t a4 = 2097151 & (load_4(a + 10) >> 4);
	int64_t a5 = 2097151 & (load_3(a + 13) >> 1);
	int64_t a6 = 2097151 & (load_4(a + 15) >> 6);
	int64_t a7 = 2097151 & (load_3(a + 18) >> 3);
	int64_t a8 = 2097151 & load_3(a + 21);
	int64_t a9 = 2097151 & (load_4(a + 23) >> 5);
	int64_t a10 = 2097151 & (load_3(a + 26) >> 2);
	int64_t a11 = (load_4(a + 28) >> 7);

	int64_t b0 = 2097151 & load_3(b);
	int64_t b1 = 2097151 & (load_4(b + 2) >> 5);
	int64_t b2 = 2097151 & (load_3(b + 5) >> 2);
	int64_t b3 = 2097151 & (load_4(b + 7) >> 7);
	int64_t b4 = 2097151 & (load_4(b + 10) >> 4);
	int64_t b5 = 2097151 & (load_3(b + 13) >> 1);
	int64_t b6 = 2097151 & (load_4(b + 15) >> 6);
	int64_t b7 = 2097151 & (load_3(b + 18) >> 3);
	int64_t b8 = 2097151 & load_3(b + 21);
	int64_t b9 = 2097151 & (load_4(b + 23) >> 5);
	int64_t b10 = 2097151 & (load_3(b + 26) >> 2);
	int64_t b11 = (load_4(b + 28) >> 7);

	int64_t c0 = 2097151 & load_3(c);
	int64_t c1 = 2097151 & (load_4(c + 2) >> 5);
	int64_t c2 = 2097151 & (load_3(c + 5) >> 2);
	int64_t c3 = 2097151 & (load_4(c + 7) >> 7);
	int64_t c4 = 2097151 & (load_4(c + 10) >> 4);
	int64_t c5 = 2097151 & (load_3(c + 13) >> 1);
	int64_t c6 = 2097151 & (load_4(c + 15) >> 6);
	int64_t c7 = 2097151 & (load_3(c + 18) >> 3);
	int64_t c8 = 2097151 & load_3(c + 21);
	int64_t c9 = 2097151 & (load_4(c + 23) >> 5);
	int64_t c10 = 2097151 & (load_3(c + 26) >> 2);
	int64_t c11 = (load_4(c + 28) >> 7);

	int64_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11;
	int64_t s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23;

	int64_t carry0, carry1, carry2, carry3, carry4, carry5, carry6;
	int64_t carry7, carry8, carry9, carry10, carry11, carry12, carry13;
	int64_t carry14, carry15, carry16, carry17, carry18, carry19, carry20;
	int64_t carry21, carry22;

	/* schoolbook multiplication: s = c + a*b over 23 limbs */
	s0 = c0 + a0*b0;
	s1 = c1 + a0*b1 + a1*b0;
	s2 = c2 + a0*b2 + a1*b1 + a2*b0;
	s3 = c3 + a0*b3 + a1*b2 + a2*b1 + a3*b0;
	s4 = c4 + a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0;
	s5 = c5 + a0*b5 + a1*b4 + a2*b3 + a3*b2 + a4*b1 + a5*b0;
	s6 = c6 + a0*b6 + a1*b5 + a2*b4 + a3*b3 + a4*b2 + a5*b1 + a6*b0;
	s7 = c7 + a0*b7 + a1*b6 + a2*b5 + a3*b4 + a4*b3 + a5*b2 + a6*b1 + a7*b0;
	s8 = c8 + a0*b8 + a1*b7 + a2*b6 + a3*b5 + a4*b4 + a5*b3 + a6*b2 + a7*b1 + a8*b0;
	s9 = c9 + a0*b9 + a1*b8 + a2*b7 + a3*b6 + a4*b5 + a5*b4 + a6*b3 + a7*b2 + a8*b1 + a9*b0;
	s10 = c10 + a0*b10 + a1*b9 + a2*b8 + a3*b7 + a4*b6 + a5*b5 + a6*b4 + a7*b3 + a8*b2 + a9*b1 + a10*b0;
	s11 = c11 + a0*b11 + a1*b10 + a2*b9 + a3*b8 + a4*b7 + a5*b6 + a6*b5 + a7*b4 + a8*b3 + a9*b2 + a10*b1 + a11*b0;
	s12 = a1*b11 + a2*b10 + a3*b9 + a4*b8 + a5*b7 + a6*b6 + a7*b5 + a8*b4 + a9*b3 + a10*b2 + a11*b1;
	s13 = a2*b11 + a3*b10 + a4*b9 + a5*b8 + a6*b7 + a7*b6 + a8*b5 + a9*b4 + a10*b3 + a11*b2;
	s14 = a3*b11 + a4*b10 + a5*b9 + a6*b8 + a7*b7 + a8*b6 + a9*b5 + a10*b4 + a11*b3;
	s15 = a4*b11 + a5*b10 + a6*b9 + a7*b8 + a8*b7 + a9*b6 + a10*b5 + a11*b4;
	s16 = a5*b11 + a6*b10 + a7*b9 + a8*b8 + a9*b7 + a10*b6 + a11*b5;
	s17 = a6*b11 + a7*b10 + a8*b9 + a9*b8 + a10*b7 + a11*b6;
	s18 = a7*b11 + a8*b10 + a9*b9 + a10*b8 + a11*b7;
	s19 = a8*b11 + a9*b10 + a10*b9 + a11*b8;
	s20 = a9*b11 + a10*b10 + a11*b9;
	s21 = a10*b11 + a11*b10;
	s22 = a11*b11;
	s23 = 0;

	/* first carry pass (even limbs, then odd limbs) to bring every limb
	 * back near 21 bits before reduction */
	carry0 = (s0 + (int64_t) (1L << 20)) >> 21;
	s1 += carry0;
	s0 -= carry0 * ((uint64_t) 1L << 21);

	carry2 = (s2 + (int64_t) (1L << 20)) >> 21;
	s3 += carry2;
	s2 -= carry2 * ((uint64_t) 1L << 21);

	carry4 = (s4 + (int64_t) (1L << 20)) >> 21;
	s5 += carry4;
	s4 -= carry4 * ((uint64_t) 1L << 21);

	carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
	s7 += carry6;
	s6 -= carry6 * ((uint64_t) 1L << 21);

	carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
	s9 += carry8;
	s8 -= carry8 * ((uint64_t) 1L << 21);

	carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
	s11 += carry10;
	s10 -= carry10 * ((uint64_t) 1L << 21);

	carry12 = (s12 + (int64_t) (1L << 20)) >> 21;
	s13 += carry12;
	s12 -= carry12 * ((uint64_t) 1L << 21);

	carry14 = (s14 + (int64_t) (1L << 20)) >> 21;
	s15 += carry14;
	s14 -= carry14 * ((uint64_t) 1L << 21);

	carry16 = (s16 + (int64_t) (1L << 20)) >> 21;
	s17 += carry16;
	s16 -= carry16 * ((uint64_t) 1L << 21);

	carry18 = (s18 + (int64_t) (1L << 20)) >> 21;
	s19 += carry18;
	s18 -= carry18 * ((uint64_t) 1L << 21);

	carry20 = (s20 + (int64_t) (1L << 20)) >> 21;
	s21 += carry20;
	s20 -= carry20 * ((uint64_t) 1L << 21);

	carry22 = (s22 + (int64_t) (1L << 20)) >> 21;
	s23 += carry22;
	s22 -= carry22 * ((uint64_t) 1L << 21);

	carry1 = (s1 + (int64_t) (1L << 20)) >> 21;
	s2 += carry1;
	s1 -= carry1 * ((uint64_t) 1L << 21);

	carry3 = (s3 + (int64_t) (1L << 20)) >> 21;
	s4 += carry3;
	s3 -= carry3 * ((uint64_t) 1L << 21);

	carry5 = (s5 + (int64_t) (1L << 20)) >> 21;
	s6 += carry5;
	s5 -= carry5 * ((uint64_t) 1L << 21);

	carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
	s8 += carry7;
	s7 -= carry7 * ((uint64_t) 1L << 21);

	carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
	s10 += carry9;
	s9 -= carry9 * ((uint64_t) 1L << 21);

	carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
	s12 += carry11;
	s11 -= carry11 * ((uint64_t) 1L << 21);

	carry13 = (s13 + (int64_t) (1L << 20)) >> 21;
	s14 += carry13;
	s13 -= carry13 * ((uint64_t) 1L << 21);

	carry15 = (s15 + (int64_t) (1L << 20)) >> 21;
	s16 += carry15;
	s15 -= carry15 * ((uint64_t) 1L << 21);

	carry17 = (s17 + (int64_t) (1L << 20)) >> 21;
	s18 += carry17;
	s17 -= carry17 * ((uint64_t) 1L << 21);

	carry19 = (s19 + (int64_t) (1L << 20)) >> 21;
	s20 += carry19;
	s19 -= carry19 * ((uint64_t) 1L << 21);

	carry21 = (s21 + (int64_t) (1L << 20)) >> 21;
	s22 += carry21;
	s21 -= carry21 * ((uint64_t) 1L << 21);

	/* Eliminate the top limbs s23..s18 using
	 * 2^252 == -27742317777372353535851937790883648493 (mod l);
	 * the constants 666643,470296,654183,997805,136657,683901 are the
	 * 21-bit limbs of that value. */
	s11 += s23 * 666643;
	s12 += s23 * 470296;
	s13 += s23 * 654183;
	s14 -= s23 * 997805;
	s15 += s23 * 136657;
	s16 -= s23 * 683901;

	s10 += s22 * 666643;
	s11 += s22 * 470296;
	s12 += s22 * 654183;
	s13 -= s22 * 997805;
	s14 += s22 * 136657;
	s15 -= s22 * 683901;

	s9 += s21 * 666643;
	s10 += s21 * 470296;
	s11 += s21 * 654183;
	s12 -= s21 * 997805;
	s13 += s21 * 136657;
	s14 -= s21 * 683901;

	s8 += s20 * 666643;
	s9 += s20 * 470296;
	s10 += s20 * 654183;
	s11 -= s20 * 997805;
	s12 += s20 * 136657;
	s13 -= s20 * 683901;

	s7 += s19 * 666643;
	s8 += s19 * 470296;
	s9 += s19 * 654183;
	s10 -= s19 * 997805;
	s11 += s19 * 136657;
	s12 -= s19 * 683901;

	s6 += s18 * 666643;
	s7 += s18 * 470296;
	s8 += s18 * 654183;
	s9 -= s18 * 997805;
	s10 += s18 * 136657;
	s11 -= s18 * 683901;

	/* carry pass to keep the middle limbs small */
	carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
	s7 += carry6;
	s6 -= carry6 * ((uint64_t) 1L << 21);

	carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
	s9 += carry8;
	s8 -= carry8 * ((uint64_t) 1L << 21);

	carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
	s11 += carry10;
	s10 -= carry10 * ((uint64_t) 1L << 21);

	carry12 = (s12 + (int64_t) (1L << 20)) >> 21;
	s13 += carry12;
	s12 -= carry12 * ((uint64_t) 1L << 21);

	carry14 = (s14 + (int64_t) (1L << 20)) >> 21;
	s15 += carry14;
	s14 -= carry14 * ((uint64_t) 1L << 21);

	carry16 = (s16 + (int64_t) (1L << 20)) >> 21;
	s17 += carry16;
	s16 -= carry16 * ((uint64_t) 1L << 21);

	carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
	s8 += carry7;
	s7 -= carry7 * ((uint64_t) 1L << 21);

	carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
	s10 += carry9;
	s9 -= carry9 * ((uint64_t) 1L << 21);

	carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
	s12 += carry11;
	s11 -= carry11 * ((uint64_t) 1L << 21);

	carry13 = (s13 + (int64_t) (1L << 20)) >> 21;
	s14 += carry13;
	s13 -= carry13 * ((uint64_t) 1L << 21);

	carry15 = (s15 + (int64_t) (1L << 20)) >> 21;
	s16 += carry15;
	s15 -= carry15 * ((uint64_t) 1L << 21);

	/* eliminate limbs s17..s12 the same way */
	s5 += s17 * 666643;
	s6 += s17 * 470296;
	s7 += s17 * 654183;
	s8 -= s17 * 997805;
	s9 += s17 * 136657;
	s10 -= s17 * 683901;

	s4 += s16 * 666643;
	s5 += s16 * 470296;
	s6 += s16 * 654183;
	s7 -= s16 * 997805;
	s8 += s16 * 136657;
	s9 -= s16 * 683901;

	s3 += s15 * 666643;
	s4 += s15 * 470296;
	s5 += s15 * 654183;
	s6 -= s15 * 997805;
	s7 += s15 * 136657;
	s8 -= s15 * 683901;

	s2 += s14 * 666643;
	s3 += s14 * 470296;
	s4 += s14 * 654183;
	s5 -= s14 * 997805;
	s6 += s14 * 136657;
	s7 -= s14 * 683901;

	s1 += s13 * 666643;
	s2 += s13 * 470296;
	s3 += s13 * 654183;
	s4 -= s13 * 997805;
	s5 += s13 * 136657;
	s6 -= s13 * 683901;

	s0 += s12 * 666643;
	s1 += s12 * 470296;
	s2 += s12 * 654183;
	s3 -= s12 * 997805;
	s4 += s12 * 136657;
	s5 -= s12 * 683901;
	s12 = 0;

	/* carry pass over the 12 remaining limbs; a carry out of s11 goes
	 * into s12 and is folded back below */
	carry0 = (s0 + (int64_t) (1L << 20)) >> 21;
	s1 += carry0;
	s0 -= carry0 * ((uint64_t) 1L << 21);

	carry2 = (s2 + (int64_t) (1L << 20)) >> 21;
	s3 += carry2;
	s2 -= carry2 * ((uint64_t) 1L << 21);

	carry4 = (s4 + (int64_t) (1L << 20)) >> 21;
	s5 += carry4;
	s4 -= carry4 * ((uint64_t) 1L << 21);

	carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
	s7 += carry6;
	s6 -= carry6 * ((uint64_t) 1L << 21);

	carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
	s9 += carry8;
	s8 -= carry8 * ((uint64_t) 1L << 21);

	carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
	s11 += carry10;
	s10 -= carry10 * ((uint64_t) 1L << 21);

	carry1 = (s1 + (int64_t) (1L << 20)) >> 21;
	s2 += carry1;
	s1 -= carry1 * ((uint64_t) 1L << 21);

	carry3 = (s3 + (int64_t) (1L << 20)) >> 21;
	s4 += carry3;
	s3 -= carry3 * ((uint64_t) 1L << 21);

	carry5 = (s5 + (int64_t) (1L << 20)) >> 21;
	s6 += carry5;
	s5 -= carry5 * ((uint64_t) 1L << 21);

	carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
	s8 += carry7;
	s7 -= carry7 * ((uint64_t) 1L << 21);

	carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
	s10 += carry9;
	s9 -= carry9 * ((uint64_t) 1L << 21);

	carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
	s12 += carry11;
	s11 -= carry11 * ((uint64_t) 1L << 21);

	/* fold the s12 overflow back in */
	s0 += s12 * 666643;
	s1 += s12 * 470296;
	s2 += s12 * 654183;
	s3 -= s12 * 997805;
	s4 += s12 * 136657;
	s5 -= s12 * 683901;
	s12 = 0;

	/* sequential (non-centered) carry chain */
	carry0 = s0 >> 21;
	s1 += carry0;
	s0 -= carry0 * ((uint64_t) 1L << 21);

	carry1 = s1 >> 21;
	s2 += carry1;
	s1 -= carry1 * ((uint64_t) 1L << 21);

	carry2 = s2 >> 21;
	s3 += carry2;
	s2 -= carry2 * ((uint64_t) 1L << 21);

	carry3 = s3 >> 21;
	s4 += carry3;
	s3 -= carry3 * ((uint64_t) 1L << 21);

	carry4 = s4 >> 21;
	s5 += carry4;
	s4 -= carry4 * ((uint64_t) 1L << 21);

	carry5 = s5 >> 21;
	s6 += carry5;
	s5 -= carry5 * ((uint64_t) 1L << 21);

	carry6 = s6 >> 21;
	s7 += carry6;
	s6 -= carry6 * ((uint64_t) 1L << 21);

	carry7 = s7 >> 21;
	s8 += carry7;
	s7 -= carry7 * ((uint64_t) 1L << 21);

	carry8 = s8 >> 21;
	s9 += carry8;
	s8 -= carry8 * ((uint64_t) 1L << 21);

	carry9 = s9 >> 21;
	s10 += carry9;
	s9 -= carry9 * ((uint64_t) 1L << 21);

	carry10 = s10 >> 21;
	s11 += carry10;
	s10 -= carry10 * ((uint64_t) 1L << 21);

	carry11 = s11 >> 21;
	s12 += carry11;
	s11 -= carry11 * ((uint64_t) 1L << 21);

	/* final fold-in of the last possible overflow */
	s0 += s12 * 666643;
	s1 += s12 * 470296;
	s2 += s12 * 654183;
	s3 -= s12 * 997805;
	s4 += s12 * 136657;
	s5 -= s12 * 683901;

	carry0 = s0 >> 21;
	s1 += carry0;
	s0 -= carry0 * ((uint64_t) 1L << 21);

	carry1 = s1 >> 21;
	s2 += carry1;
	s1 -= carry1 * ((uint64_t) 1L << 21);

	carry2 = s2 >> 21;
	s3 += carry2;
	s2 -= carry2 * ((uint64_t) 1L << 21);

	carry3 = s3 >> 21;
	s4 += carry3;
	s3 -= carry3 * ((uint64_t) 1L << 21);

	carry4 = s4 >> 21;
	s5 += carry4;
	s4 -= carry4 * ((uint64_t) 1L << 21);

	carry5 = s5 >> 21;
	s6 += carry5;
	s5 -= carry5 * ((uint64_t) 1L << 21);

	carry6 = s6 >> 21;
	s7 += carry6;
	s6 -= carry6 * ((uint64_t) 1L << 21);

	carry7 = s7 >> 21;
	s8 += carry7;
	s7 -= carry7 * ((uint64_t) 1L << 21);

	carry8 = s8 >> 21;
	s9 += carry8;
	s8 -= carry8 * ((uint64_t) 1L << 21);

	carry9 = s9 >> 21;
	s10 += carry9;
	s9 -= carry9 * ((uint64_t) 1L << 21);

	carry10 = s10 >> 21;
	s11 += carry10;
	s10 -= carry10 * ((uint64_t) 1L << 21);

	/* pack the 12 21-bit limbs back into 32 little-endian bytes */
	s[0] = s0 >> 0;
	s[1] = s0 >> 8;
	s[2] = (s0 >> 16) | (s1 * ((uint64_t) 1 << 5));
	s[3] = s1 >> 3;
	s[4] = s1 >> 11;
	s[5] = (s1 >> 19) | (s2 * ((uint64_t) 1 << 2));
	s[6] = s2 >> 6;
	s[7] = (s2 >> 14) | (s3 * ((uint64_t) 1 << 7));
	s[8] = s3 >> 1;
	s[9] = s3 >> 9;
	s[10] = (s3 >> 17) | (s4 * ((uint64_t) 1 << 4));
	s[11] = s4 >> 4;
	s[12] = s4 >> 12;
	s[13] = (s4 >> 20) | (s5 * ((uint64_t) 1 << 1));
	s[14] = s5 >> 7;
	s[15] = (s5 >> 15) | (s6 * ((uint64_t) 1 << 6));
	s[16] = s6 >> 2;
	s[17] = s6 >> 10;
	s[18] = (s6 >> 18) | (s7 * ((uint64_t) 1 << 3));
	s[19] = s7 >> 5;
	s[20] = s7 >> 13;
	s[21] = s8 >> 0;
	s[22] = s8 >> 8;
	s[23] = (s8 >> 16) | (s9 * ((uint64_t) 1 << 5));
	s[24] = s9 >> 3;
	s[25] = s9 >> 11;
	s[26] = (s9 >> 19) | (s10 * ((uint64_t) 1 << 2));
	s[27] = s10 >> 6;
	s[28] = (s10 >> 14) | (s11 * ((uint64_t) 1 << 7));
	s[29] = s11 >> 1;
	s[30] = s11 >> 9;
	s[31] = s11 >> 17;
}
2373
2374 /**
2375 * Input:
2376 * s[0]+256*s[1]+...+256^63*s[63] = s
2377 *
2378 * Output:
2379 * s[0]+256*s[1]+...+256^31*s[31] = s mod l
2380 * where l = 2^252 + 27742317777372353535851937790883648493.
2381 * Overwrites s in place.
2382 */
void sc_reduce(uint8_t *s)
{
	/* Decompose the 512-bit little-endian input into 24 limbs of 21 bits
	 * each (2097151 = 2^21 - 1).  The limbs live in signed 64-bit
	 * variables so the products and carries below cannot overflow.
	 * The whole function is straight-line and branch-free, so it runs in
	 * constant time independent of the scalar value. */
	int64_t s0 = 2097151 & load_3(s);
	int64_t s1 = 2097151 & (load_4(s + 2) >> 5);
	int64_t s2 = 2097151 & (load_3(s + 5) >> 2);
	int64_t s3 = 2097151 & (load_4(s + 7) >> 7);
	int64_t s4 = 2097151 & (load_4(s + 10) >> 4);
	int64_t s5 = 2097151 & (load_3(s + 13) >> 1);
	int64_t s6 = 2097151 & (load_4(s + 15) >> 6);
	int64_t s7 = 2097151 & (load_3(s + 18) >> 3);
	int64_t s8 = 2097151 & load_3(s + 21);
	int64_t s9 = 2097151 & (load_4(s + 23) >> 5);
	int64_t s10 = 2097151 & (load_3(s + 26) >> 2);
	int64_t s11 = 2097151 & (load_4(s + 28) >> 7);
	int64_t s12 = 2097151 & (load_4(s + 31) >> 4);
	int64_t s13 = 2097151 & (load_3(s + 34) >> 1);
	int64_t s14 = 2097151 & (load_4(s + 36) >> 6);
	int64_t s15 = 2097151 & (load_3(s + 39) >> 3);
	int64_t s16 = 2097151 & load_3(s + 42);
	int64_t s17 = 2097151 & (load_4(s + 44) >> 5);
	int64_t s18 = 2097151 & (load_3(s + 47) >> 2);
	int64_t s19 = 2097151 & (load_4(s + 49) >> 7);
	int64_t s20 = 2097151 & (load_4(s + 52) >> 4);
	int64_t s21 = 2097151 & (load_3(s + 55) >> 1);
	int64_t s22 = 2097151 & (load_4(s + 57) >> 6);
	/* Top limb: no mask, it keeps all remaining high bits. */
	int64_t s23 = (load_4(s + 60) >> 3);

	int64_t carry0, carry1, carry2, carry3, carry4, carry5, carry6;
	int64_t carry7, carry8, carry9, carry10, carry11, carry12, carry13;
	int64_t carry14, carry15, carry16;

	/* Fold the high limbs down modulo l.  Since l = 2^252 + q with
	 * q = 27742317777372353535851937790883648493, we have
	 * 2^252 == -q (mod l); the constants 666643, 470296, 654183,
	 * -997805, 136657, -683901 are the 21-bit limbs of -q, so
	 * s23 * 2^(21*23) is replaced by its residue spread over s11..s16,
	 * and likewise for s22..s18 below. */
	s11 += s23 * 666643;
	s12 += s23 * 470296;
	s13 += s23 * 654183;
	s14 -= s23 * 997805;
	s15 += s23 * 136657;
	s16 -= s23 * 683901;

	s10 += s22 * 666643;
	s11 += s22 * 470296;
	s12 += s22 * 654183;
	s13 -= s22 * 997805;
	s14 += s22 * 136657;
	s15 -= s22 * 683901;

	s9 += s21 * 666643;
	s10 += s21 * 470296;
	s11 += s21 * 654183;
	s12 -= s21 * 997805;
	s13 += s21 * 136657;
	s14 -= s21 * 683901;

	s8 += s20 * 666643;
	s9 += s20 * 470296;
	s10 += s20 * 654183;
	s11 -= s20 * 997805;
	s12 += s20 * 136657;
	s13 -= s20 * 683901;

	s7 += s19 * 666643;
	s8 += s19 * 470296;
	s9 += s19 * 654183;
	s10 -= s19 * 997805;
	s11 += s19 * 136657;
	s12 -= s19 * 683901;

	s6 += s18 * 666643;
	s7 += s18 * 470296;
	s8 += s18 * 654183;
	s9 -= s18 * 997805;
	s10 += s18 * 136657;
	s11 -= s18 * 683901;

	/* Interleaved rounding carry chain (even limbs first, then odd) to
	 * shrink the accumulated limbs back towards 21 bits before the next
	 * fold.  Adding 2^20 before the arithmetic shift rounds to nearest,
	 * keeping each remaining limb in [-2^20, 2^20]. */
	carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
	s7 += carry6;
	s6 -= carry6 * ((uint64_t) 1L << 21);

	carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
	s9 += carry8;
	s8 -= carry8 * ((uint64_t) 1L << 21);

	carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
	s11 += carry10;
	s10 -= carry10 * ((uint64_t) 1L << 21);

	carry12 = (s12 + (int64_t) (1L << 20)) >> 21;
	s13 += carry12;
	s12 -= carry12 * ((uint64_t) 1L << 21);

	carry14 = (s14 + (int64_t) (1L << 20)) >> 21;
	s15 += carry14;
	s14 -= carry14 * ((uint64_t) 1L << 21);

	carry16 = (s16 + (int64_t) (1L << 20)) >> 21;
	s17 += carry16;
	s16 -= carry16 * ((uint64_t) 1L << 21);

	carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
	s8 += carry7;
	s7 -= carry7 * ((uint64_t) 1L << 21);

	carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
	s10 += carry9;
	s9 -= carry9 * ((uint64_t) 1L << 21);

	carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
	s12 += carry11;
	s11 -= carry11 * ((uint64_t) 1L << 21);

	carry13 = (s13 + (int64_t) (1L << 20)) >> 21;
	s14 += carry13;
	s13 -= carry13 * ((uint64_t) 1L << 21);

	carry15 = (s15 + (int64_t) (1L << 20)) >> 21;
	s16 += carry15;
	s15 -= carry15 * ((uint64_t) 1L << 21);

	/* Second fold: eliminate limbs s17..s12 with the same -q limbs. */
	s5 += s17 * 666643;
	s6 += s17 * 470296;
	s7 += s17 * 654183;
	s8 -= s17 * 997805;
	s9 += s17 * 136657;
	s10 -= s17 * 683901;

	s4 += s16 * 666643;
	s5 += s16 * 470296;
	s6 += s16 * 654183;
	s7 -= s16 * 997805;
	s8 += s16 * 136657;
	s9 -= s16 * 683901;

	s3 += s15 * 666643;
	s4 += s15 * 470296;
	s5 += s15 * 654183;
	s6 -= s15 * 997805;
	s7 += s15 * 136657;
	s8 -= s15 * 683901;

	s2 += s14 * 666643;
	s3 += s14 * 470296;
	s4 += s14 * 654183;
	s5 -= s14 * 997805;
	s6 += s14 * 136657;
	s7 -= s14 * 683901;

	s1 += s13 * 666643;
	s2 += s13 * 470296;
	s3 += s13 * 654183;
	s4 -= s13 * 997805;
	s5 += s13 * 136657;
	s6 -= s13 * 683901;

	s0 += s12 * 666643;
	s1 += s12 * 470296;
	s2 += s12 * 654183;
	s3 -= s12 * 997805;
	s4 += s12 * 136657;
	s5 -= s12 * 683901;
	/* s12 cleared: it serves as the overflow slot for carry11 below. */
	s12 = 0;

	/* Rounding carry pass over the 12 surviving limbs (even, then odd);
	 * carry11 may regenerate a small s12. */
	carry0 = (s0 + (int64_t) (1L << 20)) >> 21;
	s1 += carry0;
	s0 -= carry0 * ((uint64_t) 1L << 21);

	carry2 = (s2 + (int64_t) (1L << 20)) >> 21;
	s3 += carry2;
	s2 -= carry2 * ((uint64_t) 1L << 21);

	carry4 = (s4 + (int64_t) (1L << 20)) >> 21;
	s5 += carry4;
	s4 -= carry4 * ((uint64_t) 1L << 21);

	carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
	s7 += carry6;
	s6 -= carry6 * ((uint64_t) 1L << 21);

	carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
	s9 += carry8;
	s8 -= carry8 * ((uint64_t) 1L << 21);

	carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
	s11 += carry10;
	s10 -= carry10 * ((uint64_t) 1L << 21);

	carry1 = (s1 + (int64_t) (1L << 20)) >> 21;
	s2 += carry1;
	s1 -= carry1 * ((uint64_t) 1L << 21);

	carry3 = (s3 + (int64_t) (1L << 20)) >> 21;
	s4 += carry3;
	s3 -= carry3 * ((uint64_t) 1L << 21);

	carry5 = (s5 + (int64_t) (1L << 20)) >> 21;
	s6 += carry5;
	s5 -= carry5 * ((uint64_t) 1L << 21);

	carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
	s8 += carry7;
	s7 -= carry7 * ((uint64_t) 1L << 21);

	carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
	s10 += carry9;
	s9 -= carry9 * ((uint64_t) 1L << 21);

	carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
	s12 += carry11;
	s11 -= carry11 * ((uint64_t) 1L << 21);

	/* Fold the regenerated s12 back into s0..s5 and clear it again. */
	s0 += s12 * 666643;
	s1 += s12 * 470296;
	s2 += s12 * 654183;
	s3 -= s12 * 997805;
	s4 += s12 * 136657;
	s5 -= s12 * 683901;
	s12 = 0;

	/* Sequential (non-rounding) carry chain: normalize each limb to
	 * [0, 2^21) from the bottom up; a final carry may land in s12. */
	carry0 = s0 >> 21;
	s1 += carry0;
	s0 -= carry0 * ((uint64_t) 1L << 21);

	carry1 = s1 >> 21;
	s2 += carry1;
	s1 -= carry1 * ((uint64_t) 1L << 21);

	carry2 = s2 >> 21;
	s3 += carry2;
	s2 -= carry2 * ((uint64_t) 1L << 21);

	carry3 = s3 >> 21;
	s4 += carry3;
	s3 -= carry3 * ((uint64_t) 1L << 21);

	carry4 = s4 >> 21;
	s5 += carry4;
	s4 -= carry4 * ((uint64_t) 1L << 21);

	carry5 = s5 >> 21;
	s6 += carry5;
	s5 -= carry5 * ((uint64_t) 1L << 21);

	carry6 = s6 >> 21;
	s7 += carry6;
	s6 -= carry6 * ((uint64_t) 1L << 21);

	carry7 = s7 >> 21;
	s8 += carry7;
	s7 -= carry7 * ((uint64_t) 1L << 21);

	carry8 = s8 >> 21;
	s9 += carry8;
	s8 -= carry8 * ((uint64_t) 1L << 21);

	carry9 = s9 >> 21;
	s10 += carry9;
	s9 -= carry9 * ((uint64_t) 1L << 21);

	carry10 = s10 >> 21;
	s11 += carry10;
	s10 -= carry10 * ((uint64_t) 1L << 21);

	carry11 = s11 >> 21;
	s12 += carry11;
	s11 -= carry11 * ((uint64_t) 1L << 21);

	/* Last fold of s12 (at most 1 here, so the result fits below l). */
	s0 += s12 * 666643;
	s1 += s12 * 470296;
	s2 += s12 * 654183;
	s3 -= s12 * 997805;
	s4 += s12 * 136657;
	s5 -= s12 * 683901;

	/* Final carry chain; no carry out of s11 is possible any more. */
	carry0 = s0 >> 21;
	s1 += carry0;
	s0 -= carry0 * ((uint64_t) 1L << 21);

	carry1 = s1 >> 21;
	s2 += carry1;
	s1 -= carry1 * ((uint64_t) 1L << 21);

	carry2 = s2 >> 21;
	s3 += carry2;
	s2 -= carry2 * ((uint64_t) 1L << 21);

	carry3 = s3 >> 21;
	s4 += carry3;
	s3 -= carry3 * ((uint64_t) 1L << 21);

	carry4 = s4 >> 21;
	s5 += carry4;
	s4 -= carry4 * ((uint64_t) 1L << 21);

	carry5 = s5 >> 21;
	s6 += carry5;
	s5 -= carry5 * ((uint64_t) 1L << 21);

	carry6 = s6 >> 21;
	s7 += carry6;
	s6 -= carry6 * ((uint64_t) 1L << 21);

	carry7 = s7 >> 21;
	s8 += carry7;
	s7 -= carry7 * ((uint64_t) 1L << 21);

	carry8 = s8 >> 21;
	s9 += carry8;
	s8 -= carry8 * ((uint64_t) 1L << 21);

	carry9 = s9 >> 21;
	s10 += carry9;
	s9 -= carry9 * ((uint64_t) 1L << 21);

	carry10 = s10 >> 21;
	s11 += carry10;
	s10 -= carry10 * ((uint64_t) 1L << 21);

	/* Repack the twelve 21-bit limbs (12 * 21 = 252 bits) into the 32
	 * little-endian output bytes, overwriting the input buffer. */
	s[0] = s0 >> 0;
	s[1] = s0 >> 8;
	s[2] = (s0 >> 16) | (s1 * ((uint64_t) 1 << 5));
	s[3] = s1 >> 3;
	s[4] = s1 >> 11;
	s[5] = (s1 >> 19) | (s2 * ((uint64_t) 1 << 2));
	s[6] = s2 >> 6;
	s[7] = (s2 >> 14) | (s3 * ((uint64_t) 1 << 7));
	s[8] = s3 >> 1;
	s[9] = s3 >> 9;
	s[10] = (s3 >> 17) | (s4 * ((uint64_t) 1 << 4));
	s[11] = s4 >> 4;
	s[12] = s4 >> 12;
	s[13] = (s4 >> 20) | (s5 * ((uint64_t) 1 << 1));
	s[14] = s5 >> 7;
	s[15] = (s5 >> 15) | (s6 * ((uint64_t) 1 << 6));
	s[16] = s6 >> 2;
	s[17] = s6 >> 10;
	s[18] = (s6 >> 18) | (s7 * ((uint64_t) 1 << 3));
	s[19] = s7 >> 5;
	s[20] = s7 >> 13;
	s[21] = s8 >> 0;
	s[22] = s8 >> 8;
	s[23] = (s8 >> 16) | (s9 * ((uint64_t) 1 << 5));
	s[24] = s9 >> 3;
	s[25] = s9 >> 11;
	s[26] = (s9 >> 19) | (s10 * ((uint64_t) 1 << 2));
	s[27] = s10 >> 6;
	s[28] = (s10 >> 14) | (s11 * ((uint64_t) 1 << 7));
	s[29] = s11 >> 1;
	s[30] = s11 >> 9;
	s[31] = s11 >> 17;
}