]>
git.ipfire.org Git - thirdparty/strongswan.git/blob - src/libstrongswan/plugins/curve25519/ref10/ref10.c
2 * Copyright (C) 2016 Andreas Steffen
4 * Based on the public domain libsodium adaptation by Frank Denis
5 * of the SUPERCOP ref10 implementation by Daniel J. Bernstein,
6 * Niels Duif, Peter Schwabe, Tanja Lange and Bo-Yin Yang.
15 #include <utils/utils.h>
/**
 * Assemble three consecutive bytes into an integer, least significant
 * byte first (little-endian).
 *
 * Each byte is widened to 64 bits before shifting, so no bits are lost and
 * no sign extension can occur.
 *
 * @param in	pointer to at least 3 readable bytes
 * @return		in[0] + 2^8 * in[1] + 2^16 * in[2]
 */
static uint64_t load_3(const uint8_t *in)
{
	uint64_t result;

	result = (uint64_t) in[0];
	result |= ((uint64_t) in[1]) << 8;
	result |= ((uint64_t) in[2]) << 16;

	return result;
}
/**
 * Assemble four consecutive bytes into an integer, least significant
 * byte first (little-endian).
 *
 * Each byte is widened to 64 bits before shifting, so a set top bit in
 * in[3] yields a positive 32-bit value rather than a sign-extended one.
 *
 * @param in	pointer to at least 4 readable bytes
 * @return		in[0] + 2^8 * in[1] + 2^16 * in[2] + 2^24 * in[3]
 */
static uint64_t load_4(const uint8_t *in)
{
	uint64_t result;

	result = (uint64_t) in[0];
	result |= ((uint64_t) in[1]) << 8;
	result |= ((uint64_t) in[2]) << 16;
	result |= ((uint64_t) in[3]) << 24;

	return result;
}
43 static void fe_0(fe h
)
45 memset(&h
[0], 0, 10 * sizeof h
[0]);
51 static void fe_1(fe h
)
55 memset(&h
[2], 0, 8 * sizeof h
[0]);
60 * Can overlap h with f or g.
63 * |f| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
64 * |g| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
67 * |h| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25, etc.
69 static void fe_add(fe h
, const fe f
, const fe g
)
100 int32_t h7
= f7
+ g7
;
101 int32_t h8
= f8
+ g8
;
102 int32_t h9
= f9
+ g9
;
117 * Replace (f,g) with (g,g) if b == 1;
118 * replace (f,g) with (f,g) if b == 0.
120 * Preconditions: b in {0,1}.
122 static void fe_cmov(fe f
, const fe g
, unsigned int b
)
146 int32_t x0
= f0
^ g0
;
147 int32_t x1
= f1
^ g1
;
148 int32_t x2
= f2
^ g2
;
149 int32_t x3
= f3
^ g3
;
150 int32_t x4
= f4
^ g4
;
151 int32_t x5
= f5
^ g5
;
152 int32_t x6
= f6
^ g6
;
153 int32_t x7
= f7
^ g7
;
154 int32_t x8
= f8
^ g8
;
155 int32_t x9
= f9
^ g9
;
157 b
= (unsigned int) (- (int) b
);
185 static void fe_copy(fe h
, const fe f
)
211 * Ignores top bit of h.
213 static void fe_frombytes(fe h
, const uint8_t *s
)
215 int64_t h0
= load_4(s
);
216 int64_t h1
= load_3(s
+ 4) << 6;
217 int64_t h2
= load_3(s
+ 7) << 5;
218 int64_t h3
= load_3(s
+ 10) << 3;
219 int64_t h4
= load_3(s
+ 13) << 2;
220 int64_t h5
= load_4(s
+ 16);
221 int64_t h6
= load_3(s
+ 20) << 7;
222 int64_t h7
= load_3(s
+ 23) << 5;
223 int64_t h8
= load_3(s
+ 26) << 4;
224 int64_t h9
= (load_3(s
+ 29) & 8388607) << 2;
226 int64_t carry0
, carry1
, carry2
, carry3
, carry4
;
227 int64_t carry5
, carry6
, carry7
, carry8
, carry9
;
229 carry9
= (h9
+ (int64_t) (1L << 24)) >> 25;
231 h9
-= carry9
* ((uint64_t) 1L << 25);
233 carry1
= (h1
+ (int64_t) (1L << 24)) >> 25;
235 h1
-= carry1
* ((uint64_t) 1L << 25);
237 carry3
= (h3
+ (int64_t) (1L << 24)) >> 25;
239 h3
-= carry3
* ((uint64_t) 1L << 25);
241 carry5
= (h5
+ (int64_t) (1L << 24)) >> 25;
243 h5
-= carry5
* ((uint64_t) 1L << 25);
245 carry7
= (h7
+ (int64_t) (1L << 24)) >> 25;
247 h7
-= carry7
* ((uint64_t) 1L << 25);
249 carry0
= (h0
+ (int64_t) (1L << 25)) >> 26;
251 h0
-= carry0
* ((uint64_t) 1L << 26);
253 carry2
= (h2
+ (int64_t) (1L << 25)) >> 26;
255 h2
-= carry2
* ((uint64_t) 1L << 26);
257 carry4
= (h4
+ (int64_t) (1L << 25)) >> 26;
259 h4
-= carry4
* ((uint64_t) 1L << 26);
261 carry6
= (h6
+ (int64_t) (1L << 25)) >> 26;
263 h6
-= carry6
* ((uint64_t) 1L << 26);
265 carry8
= (h8
+ (int64_t) (1L << 25)) >> 26;
267 h8
-= carry8
* ((uint64_t) 1L << 26);
283 * |h| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25, etc.
285 * Write p=2^255-19; q=floor(h/p).
286 * Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
289 * Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
290 * Also have |h-2^230 h9|<2^231 so |19 2^(-255)(h-2^230 h9)|<1/4.
292 * Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
296 * Have 0<=r<=p-1=2^255-20.
297 * Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
299 * Write x=r+19(2^-255)r+y.
300 * Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
302 * Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
303 * so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
305 static void fe_tobytes(uint8_t *s
, const fe h
)
318 int32_t carry0
, carry1
, carry2
, carry3
, carry4
;
319 int32_t carry5
, carry6
, carry7
, carry8
, carry9
;
322 q
= (19 * h9
+ ((uint32_t) 1L << 24)) >> 25;
334 /* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
336 /* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
340 h0
-= carry0
* ((uint32_t) 1L << 26);
344 h1
-= carry1
* ((uint32_t) 1L << 25);
348 h2
-= carry2
* ((uint32_t) 1L << 26);
352 h3
-= carry3
* ((uint32_t) 1L << 25);
356 h4
-= carry4
* ((uint32_t) 1L << 26);
360 h5
-= carry5
* ((uint32_t) 1L << 25);
364 h6
-= carry6
* ((uint32_t) 1L << 26);
368 h7
-= carry7
* ((uint32_t) 1L << 25);
372 h8
-= carry8
* ((uint32_t) 1L << 26);
375 h9
-= carry9
* ((uint32_t) 1L << 25);
379 * Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
380 * Have h0+...+2^230 h9 between 0 and 2^255-1;
381 * evidently 2^255 h10-2^255 q = 0.
382 * Goal: Output h0+...+2^230 h9.
387 s
[3] = (h0
>> 24) | (h1
* ((uint32_t) 1 << 2));
390 s
[6] = (h1
>> 22) | (h2
* ((uint32_t) 1 << 3));
393 s
[9] = (h2
>> 21) | (h3
* ((uint32_t) 1 << 5));
396 s
[12] = (h3
>> 19) | (h4
* ((uint32_t) 1 << 6));
403 s
[19] = (h5
>> 24) | (h6
* ((uint32_t) 1 << 1));
406 s
[22] = (h6
>> 23) | (h7
* ((uint32_t) 1 << 3));
409 s
[25] = (h7
>> 21) | (h8
* ((uint32_t) 1 << 4));
412 s
[28] = (h8
>> 20) | (h9
* ((uint32_t) 1 << 6));
419 * return 1 if f is in {1,3,5,...,q-2}
420 * return 0 if f is in {0,2,4,...,q-1}
423 * |f| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25, etc.
425 static int fe_isnegative(const fe f
)
 * |f| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25, etc.
441 static uint8_t zero
[32];
443 static int fe_isnonzero(const fe f
)
449 return !memeq_const(s
, zero
, 32);
454 * Can overlap h with f or g.
457 * |f| bounded by 1.65*2^26, 1.65*2^25, 1.65*2^26, 1.65*2^25, etc.
458 * |g| bounded by 1.65*2^26, 1.65*2^25, 1.65*2^26, 1.65*2^25, etc.
461 * |h| bounded by 1.01*2^25, 1.01*2^24, 1.01*2^25, 1.01*2^24, etc.
465 * Notes on implementation strategy:
467 * Using schoolbook multiplication.
468 * Karatsuba would save a little in some cost models.
470 * Most multiplications by 2 and 19 are 32-bit precomputations;
471 * cheaper than 64-bit postcomputations.
473 * There is one remaining multiplication by 19 in the carry chain;
474 * one *19 precomputation can be merged into this,
475 * but the resulting data flow is considerably less clean.
477 * There are 12 carries below.
478 * 10 of them are 2-way parallelizable and vectorizable.
479 * Can get away with 11 carries, but then data flow is much deeper.
481 * With tighter constraints on inputs can squeeze carries into int32.
484 static void fe_mul(fe h
, const fe f
, const fe g
)
508 int32_t g1_19
= 19 * g1
; /* 1.959375*2^29 */
509 int32_t g2_19
= 19 * g2
; /* 1.959375*2^30; still ok */
510 int32_t g3_19
= 19 * g3
;
511 int32_t g4_19
= 19 * g4
;
512 int32_t g5_19
= 19 * g5
;
513 int32_t g6_19
= 19 * g6
;
514 int32_t g7_19
= 19 * g7
;
515 int32_t g8_19
= 19 * g8
;
516 int32_t g9_19
= 19 * g9
;
518 int32_t f1_2
= 2 * f1
;
519 int32_t f3_2
= 2 * f3
;
520 int32_t f5_2
= 2 * f5
;
521 int32_t f7_2
= 2 * f7
;
522 int32_t f9_2
= 2 * f9
;
524 int64_t f0g0
= f0
* (int64_t) g0
;
525 int64_t f0g1
= f0
* (int64_t) g1
;
526 int64_t f0g2
= f0
* (int64_t) g2
;
527 int64_t f0g3
= f0
* (int64_t) g3
;
528 int64_t f0g4
= f0
* (int64_t) g4
;
529 int64_t f0g5
= f0
* (int64_t) g5
;
530 int64_t f0g6
= f0
* (int64_t) g6
;
531 int64_t f0g7
= f0
* (int64_t) g7
;
532 int64_t f0g8
= f0
* (int64_t) g8
;
533 int64_t f0g9
= f0
* (int64_t) g9
;
535 int64_t f1g0
= f1
* (int64_t) g0
;
536 int64_t f1g1_2
= f1_2
* (int64_t) g1
;
537 int64_t f1g2
= f1
* (int64_t) g2
;
538 int64_t f1g3_2
= f1_2
* (int64_t) g3
;
539 int64_t f1g4
= f1
* (int64_t) g4
;
540 int64_t f1g5_2
= f1_2
* (int64_t) g5
;
541 int64_t f1g6
= f1
* (int64_t) g6
;
542 int64_t f1g7_2
= f1_2
* (int64_t) g7
;
543 int64_t f1g8
= f1
* (int64_t) g8
;
544 int64_t f1g9_38
= f1_2
* (int64_t) g9_19
;
546 int64_t f2g0
= f2
* (int64_t) g0
;
547 int64_t f2g1
= f2
* (int64_t) g1
;
548 int64_t f2g2
= f2
* (int64_t) g2
;
549 int64_t f2g3
= f2
* (int64_t) g3
;
550 int64_t f2g4
= f2
* (int64_t) g4
;
551 int64_t f2g5
= f2
* (int64_t) g5
;
552 int64_t f2g6
= f2
* (int64_t) g6
;
553 int64_t f2g7
= f2
* (int64_t) g7
;
554 int64_t f2g8_19
= f2
* (int64_t) g8_19
;
555 int64_t f2g9_19
= f2
* (int64_t) g9_19
;
557 int64_t f3g0
= f3
* (int64_t) g0
;
558 int64_t f3g1_2
= f3_2
* (int64_t) g1
;
559 int64_t f3g2
= f3
* (int64_t) g2
;
560 int64_t f3g3_2
= f3_2
* (int64_t) g3
;
561 int64_t f3g4
= f3
* (int64_t) g4
;
562 int64_t f3g5_2
= f3_2
* (int64_t) g5
;
563 int64_t f3g6
= f3
* (int64_t) g6
;
564 int64_t f3g7_38
= f3_2
* (int64_t) g7_19
;
565 int64_t f3g8_19
= f3
* (int64_t) g8_19
;
566 int64_t f3g9_38
= f3_2
* (int64_t) g9_19
;
568 int64_t f4g0
= f4
* (int64_t) g0
;
569 int64_t f4g1
= f4
* (int64_t) g1
;
570 int64_t f4g2
= f4
* (int64_t) g2
;
571 int64_t f4g3
= f4
* (int64_t) g3
;
572 int64_t f4g4
= f4
* (int64_t) g4
;
573 int64_t f4g5
= f4
* (int64_t) g5
;
574 int64_t f4g6_19
= f4
* (int64_t) g6_19
;
575 int64_t f4g7_19
= f4
* (int64_t) g7_19
;
576 int64_t f4g8_19
= f4
* (int64_t) g8_19
;
577 int64_t f4g9_19
= f4
* (int64_t) g9_19
;
579 int64_t f5g0
= f5
* (int64_t) g0
;
580 int64_t f5g1_2
= f5_2
* (int64_t) g1
;
581 int64_t f5g2
= f5
* (int64_t) g2
;
582 int64_t f5g3_2
= f5_2
* (int64_t) g3
;
583 int64_t f5g4
= f5
* (int64_t) g4
;
584 int64_t f5g5_38
= f5_2
* (int64_t) g5_19
;
585 int64_t f5g6_19
= f5
* (int64_t) g6_19
;
586 int64_t f5g7_38
= f5_2
* (int64_t) g7_19
;
587 int64_t f5g8_19
= f5
* (int64_t) g8_19
;
588 int64_t f5g9_38
= f5_2
* (int64_t) g9_19
;
590 int64_t f6g0
= f6
* (int64_t) g0
;
591 int64_t f6g1
= f6
* (int64_t) g1
;
592 int64_t f6g2
= f6
* (int64_t) g2
;
593 int64_t f6g3
= f6
* (int64_t) g3
;
594 int64_t f6g4_19
= f6
* (int64_t) g4_19
;
595 int64_t f6g5_19
= f6
* (int64_t) g5_19
;
596 int64_t f6g6_19
= f6
* (int64_t) g6_19
;
597 int64_t f6g7_19
= f6
* (int64_t) g7_19
;
598 int64_t f6g8_19
= f6
* (int64_t) g8_19
;
599 int64_t f6g9_19
= f6
* (int64_t) g9_19
;
601 int64_t f7g0
= f7
* (int64_t) g0
;
602 int64_t f7g1_2
= f7_2
* (int64_t) g1
;
603 int64_t f7g2
= f7
* (int64_t) g2
;
604 int64_t f7g3_38
= f7_2
* (int64_t) g3_19
;
605 int64_t f7g4_19
= f7
* (int64_t) g4_19
;
606 int64_t f7g5_38
= f7_2
* (int64_t) g5_19
;
607 int64_t f7g6_19
= f7
* (int64_t) g6_19
;
608 int64_t f7g7_38
= f7_2
* (int64_t) g7_19
;
609 int64_t f7g8_19
= f7
* (int64_t) g8_19
;
610 int64_t f7g9_38
= f7_2
* (int64_t) g9_19
;
612 int64_t f8g0
= f8
* (int64_t) g0
;
613 int64_t f8g1
= f8
* (int64_t) g1
;
614 int64_t f8g2_19
= f8
* (int64_t) g2_19
;
615 int64_t f8g3_19
= f8
* (int64_t) g3_19
;
616 int64_t f8g4_19
= f8
* (int64_t) g4_19
;
617 int64_t f8g5_19
= f8
* (int64_t) g5_19
;
618 int64_t f8g6_19
= f8
* (int64_t) g6_19
;
619 int64_t f8g7_19
= f8
* (int64_t) g7_19
;
620 int64_t f8g8_19
= f8
* (int64_t) g8_19
;
621 int64_t f8g9_19
= f8
* (int64_t) g9_19
;
623 int64_t f9g0
= f9
* (int64_t) g0
;
624 int64_t f9g1_38
= f9_2
* (int64_t) g1_19
;
625 int64_t f9g2_19
= f9
* (int64_t) g2_19
;
626 int64_t f9g3_38
= f9_2
* (int64_t) g3_19
;
627 int64_t f9g4_19
= f9
* (int64_t) g4_19
;
628 int64_t f9g5_38
= f9_2
* (int64_t) g5_19
;
629 int64_t f9g6_19
= f9
* (int64_t) g6_19
;
630 int64_t f9g7_38
= f9_2
* (int64_t) g7_19
;
631 int64_t f9g8_19
= f9
* (int64_t) g8_19
;
632 int64_t f9g9_38
= f9_2
* (int64_t) g9_19
;
634 int64_t h0
= f0g0
+ f1g9_38
+ f2g8_19
+ f3g7_38
+ f4g6_19
+ f5g5_38
+
635 f6g4_19
+ f7g3_38
+ f8g2_19
+ f9g1_38
;
636 int64_t h1
= f0g1
+ f1g0
+ f2g9_19
+ f3g8_19
+ f4g7_19
+ f5g6_19
+
637 f6g5_19
+ f7g4_19
+ f8g3_19
+ f9g2_19
;
638 int64_t h2
= f0g2
+ f1g1_2
+ f2g0
+ f3g9_38
+ f4g8_19
+ f5g7_38
+
639 f6g6_19
+ f7g5_38
+ f8g4_19
+ f9g3_38
;
640 int64_t h3
= f0g3
+ f1g2
+ f2g1
+ f3g0
+ f4g9_19
+ f5g8_19
+
641 f6g7_19
+ f7g6_19
+ f8g5_19
+ f9g4_19
;
642 int64_t h4
= f0g4
+ f1g3_2
+ f2g2
+ f3g1_2
+ f4g0
+ f5g9_38
+
643 f6g8_19
+ f7g7_38
+ f8g6_19
+ f9g5_38
;
644 int64_t h5
= f0g5
+ f1g4
+ f2g3
+ f3g2
+ f4g1
+ f5g0
+
645 f6g9_19
+ f7g8_19
+ f8g7_19
+ f9g6_19
;
646 int64_t h6
= f0g6
+ f1g5_2
+ f2g4
+ f3g3_2
+ f4g2
+ f5g1_2
+
647 f6g0
+ f7g9_38
+ f8g8_19
+ f9g7_38
;
648 int64_t h7
= f0g7
+ f1g6
+ f2g5
+ f3g4
+ f4g3
+ f5g2
+
649 f6g1
+ f7g0
+ f8g9_19
+ f9g8_19
;
650 int64_t h8
= f0g8
+ f1g7_2
+ f2g6
+ f3g5_2
+ f4g4
+ f5g3_2
+
651 f6g2
+ f7g1_2
+ f8g0
+ f9g9_38
;
652 int64_t h9
= f0g9
+ f1g8
+ f2g7
+ f3g6
+ f4g5
+ f5g4
+
653 f6g3
+ f7g2
+ f8g1
+ f9g0
;
655 int64_t carry0
, carry1
, carry2
, carry3
, carry4
;
656 int64_t carry5
, carry6
, carry7
, carry8
, carry9
;
659 * |h0| <= (1.65*1.65*2^52*(1+19+19+19+19)+1.65*1.65*2^50*(38+38+38+38+38))
660 * i.e. |h0| <= 1.4*2^60; narrower ranges for h2, h4, h6, h8
661 * |h1| <= (1.65*1.65*2^51*(1+1+19+19+19+19+19+19+19+19))
662 * i.e. |h1| <= 1.7*2^59; narrower ranges for h3, h5, h7, h9
665 carry0
= (h0
+ (int64_t) (1L << 25)) >> 26;
667 h0
-= carry0
* ((uint64_t) 1L << 26);
669 /* |h1| <= 1.71*2^59 */
671 carry4
= (h4
+ (int64_t) (1L << 25)) >> 26;
673 h4
-= carry4
* ((uint64_t) 1L << 26);
675 /* |h5| <= 1.71*2^59 */
677 carry1
= (h1
+ (int64_t) (1L << 24)) >> 25;
679 h1
-= carry1
* ((uint64_t) 1L << 25);
680 /* |h1| <= 2^24; from now on fits into int32 */
681 /* |h2| <= 1.41*2^60 */
683 carry5
= (h5
+ (int64_t) (1L << 24)) >> 25;
685 h5
-= carry5
* ((uint64_t) 1L << 25);
686 /* |h5| <= 2^24; from now on fits into int32 */
687 /* |h6| <= 1.41*2^60 */
689 carry2
= (h2
+ (int64_t) (1L << 25)) >> 26;
691 h2
-= carry2
* ((uint64_t) 1L << 26);
692 /* |h2| <= 2^25; from now on fits into int32 unchanged */
693 /* |h3| <= 1.71*2^59 */
695 carry6
= (h6
+ (int64_t) (1L << 25)) >> 26;
697 h6
-= carry6
* ((uint64_t) 1L << 26);
698 /* |h6| <= 2^25; from now on fits into int32 unchanged */
699 /* |h7| <= 1.71*2^59 */
701 carry3
= (h3
+ (int64_t) (1L << 24)) >> 25;
703 h3
-= carry3
* ((uint64_t) 1L << 25);
704 /* |h3| <= 2^24; from now on fits into int32 unchanged */
705 /* |h4| <= 1.72*2^34 */
707 carry7
= (h7
+ (int64_t) (1L << 24)) >> 25;
709 h7
-= carry7
* ((uint64_t) 1L << 25);
710 /* |h7| <= 2^24; from now on fits into int32 unchanged */
711 /* |h8| <= 1.41*2^60 */
713 carry4
= (h4
+ (int64_t) (1L << 25)) >> 26;
715 h4
-= carry4
* ((uint64_t) 1L << 26);
716 /* |h4| <= 2^25; from now on fits into int32 unchanged */
717 /* |h5| <= 1.01*2^24 */
719 carry8
= (h8
+ (int64_t) (1L << 25)) >> 26;
721 h8
-= carry8
* ((uint64_t) 1L << 26);
722 /* |h8| <= 2^25; from now on fits into int32 unchanged */
723 /* |h9| <= 1.71*2^59 */
725 carry9
= (h9
+ (int64_t) (1L << 24)) >> 25;
727 h9
-= carry9
* ((uint64_t) 1L << 25);
728 /* |h9| <= 2^24; from now on fits into int32 unchanged */
729 /* |h0| <= 1.1*2^39 */
731 carry0
= (h0
+ (int64_t) (1L << 25)) >> 26;
733 h0
-= carry0
* ((uint64_t) 1L << 26);
734 /* |h0| <= 2^25; from now on fits into int32 unchanged */
735 /* |h1| <= 1.01*2^24 */
753 * |f| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
756 * |h| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
758 static void fe_neg(fe h
,const fe f
)
796 * Can overlap h with f.
799 * |f| bounded by 1.65*2^26, 1.65*2^25, 1.65*2^26, 1.65*2^25, etc.
802 * |h| bounded by 1.01*2^25, 1.01*2^24, 1.01*2^25, 1.01*2^24, etc.
 * See fe_mul() for discussion of implementation strategy.
806 static void fe_sq(fe h
, const fe f
)
819 int32_t f0_2
= 2 * f0
;
820 int32_t f1_2
= 2 * f1
;
821 int32_t f2_2
= 2 * f2
;
822 int32_t f3_2
= 2 * f3
;
823 int32_t f4_2
= 2 * f4
;
824 int32_t f5_2
= 2 * f5
;
825 int32_t f6_2
= 2 * f6
;
826 int32_t f7_2
= 2 * f7
;
828 int32_t f5_38
= 38 * f5
; /* 1.959375*2^30 */
829 int32_t f6_19
= 19 * f6
; /* 1.959375*2^30 */
830 int32_t f7_38
= 38 * f7
; /* 1.959375*2^30 */
831 int32_t f8_19
= 19 * f8
; /* 1.959375*2^30 */
832 int32_t f9_38
= 38 * f9
; /* 1.959375*2^30 */
834 int64_t f0f0
= f0
* (int64_t) f0
;
835 int64_t f0f1_2
= f0_2
* (int64_t) f1
;
836 int64_t f0f2_2
= f0_2
* (int64_t) f2
;
837 int64_t f0f3_2
= f0_2
* (int64_t) f3
;
838 int64_t f0f4_2
= f0_2
* (int64_t) f4
;
839 int64_t f0f5_2
= f0_2
* (int64_t) f5
;
840 int64_t f0f6_2
= f0_2
* (int64_t) f6
;
841 int64_t f0f7_2
= f0_2
* (int64_t) f7
;
842 int64_t f0f8_2
= f0_2
* (int64_t) f8
;
843 int64_t f0f9_2
= f0_2
* (int64_t) f9
;
845 int64_t f1f1_2
= f1_2
* (int64_t) f1
;
846 int64_t f1f2_2
= f1_2
* (int64_t) f2
;
847 int64_t f1f3_4
= f1_2
* (int64_t) f3_2
;
848 int64_t f1f4_2
= f1_2
* (int64_t) f4
;
849 int64_t f1f5_4
= f1_2
* (int64_t) f5_2
;
850 int64_t f1f6_2
= f1_2
* (int64_t) f6
;
851 int64_t f1f7_4
= f1_2
* (int64_t) f7_2
;
852 int64_t f1f8_2
= f1_2
* (int64_t) f8
;
853 int64_t f1f9_76
= f1_2
* (int64_t) f9_38
;
855 int64_t f2f2
= f2
* (int64_t) f2
;
856 int64_t f2f3_2
= f2_2
* (int64_t) f3
;
857 int64_t f2f4_2
= f2_2
* (int64_t) f4
;
858 int64_t f2f5_2
= f2_2
* (int64_t) f5
;
859 int64_t f2f6_2
= f2_2
* (int64_t) f6
;
860 int64_t f2f7_2
= f2_2
* (int64_t) f7
;
861 int64_t f2f8_38
= f2_2
* (int64_t) f8_19
;
862 int64_t f2f9_38
= f2
* (int64_t) f9_38
;
864 int64_t f3f3_2
= f3_2
* (int64_t) f3
;
865 int64_t f3f4_2
= f3_2
* (int64_t) f4
;
866 int64_t f3f5_4
= f3_2
* (int64_t) f5_2
;
867 int64_t f3f6_2
= f3_2
* (int64_t) f6
;
868 int64_t f3f7_76
= f3_2
* (int64_t) f7_38
;
869 int64_t f3f8_38
= f3_2
* (int64_t) f8_19
;
870 int64_t f3f9_76
= f3_2
* (int64_t) f9_38
;
872 int64_t f4f4
= f4
* (int64_t) f4
;
873 int64_t f4f5_2
= f4_2
* (int64_t) f5
;
874 int64_t f4f6_38
= f4_2
* (int64_t) f6_19
;
875 int64_t f4f7_38
= f4
* (int64_t) f7_38
;
876 int64_t f4f8_38
= f4_2
* (int64_t) f8_19
;
877 int64_t f4f9_38
= f4
* (int64_t) f9_38
;
879 int64_t f5f5_38
= f5
* (int64_t) f5_38
;
880 int64_t f5f6_38
= f5_2
* (int64_t) f6_19
;
881 int64_t f5f7_76
= f5_2
* (int64_t) f7_38
;
882 int64_t f5f8_38
= f5_2
* (int64_t) f8_19
;
883 int64_t f5f9_76
= f5_2
* (int64_t) f9_38
;
885 int64_t f6f6_19
= f6
* (int64_t) f6_19
;
886 int64_t f6f7_38
= f6
* (int64_t) f7_38
;
887 int64_t f6f8_38
= f6_2
* (int64_t) f8_19
;
888 int64_t f6f9_38
= f6
* (int64_t) f9_38
;
890 int64_t f7f7_38
= f7
* (int64_t) f7_38
;
891 int64_t f7f8_38
= f7_2
* (int64_t) f8_19
;
892 int64_t f7f9_76
= f7_2
* (int64_t) f9_38
;
894 int64_t f8f8_19
= f8
* (int64_t) f8_19
;
895 int64_t f8f9_38
= f8
* (int64_t) f9_38
;
897 int64_t f9f9_38
= f9
* (int64_t) f9_38
;
899 int64_t h0
= f0f0
+ f1f9_76
+ f2f8_38
+ f3f7_76
+ f4f6_38
+ f5f5_38
;
900 int64_t h1
= f0f1_2
+ f2f9_38
+ f3f8_38
+ f4f7_38
+ f5f6_38
;
901 int64_t h2
= f0f2_2
+ f1f1_2
+ f3f9_76
+ f4f8_38
+ f5f7_76
+ f6f6_19
;
902 int64_t h3
= f0f3_2
+ f1f2_2
+ f4f9_38
+ f5f8_38
+ f6f7_38
;
903 int64_t h4
= f0f4_2
+ f1f3_4
+ f2f2
+ f5f9_76
+ f6f8_38
+ f7f7_38
;
904 int64_t h5
= f0f5_2
+ f1f4_2
+ f2f3_2
+ f6f9_38
+ f7f8_38
;
905 int64_t h6
= f0f6_2
+ f1f5_4
+ f2f4_2
+ f3f3_2
+ f7f9_76
+ f8f8_19
;
906 int64_t h7
= f0f7_2
+ f1f6_2
+ f2f5_2
+ f3f4_2
+ f8f9_38
;
907 int64_t h8
= f0f8_2
+ f1f7_4
+ f2f6_2
+ f3f5_4
+ f4f4
+ f9f9_38
;
908 int64_t h9
= f0f9_2
+ f1f8_2
+ f2f7_2
+ f3f6_2
+ f4f5_2
;
910 int64_t carry0
, carry1
, carry2
, carry3
, carry4
;
911 int64_t carry5
, carry6
, carry7
, carry8
, carry9
;
913 carry0
= (h0
+ (int64_t) (1L << 25)) >> 26;
915 h0
-= carry0
* ((uint64_t) 1L << 26);
917 carry4
= (h4
+ (int64_t) (1L << 25)) >> 26;
919 h4
-= carry4
* ((uint64_t) 1L << 26);
921 carry1
= (h1
+ (int64_t) (1L << 24)) >> 25;
923 h1
-= carry1
* ((uint64_t) 1L << 25);
925 carry5
= (h5
+ (int64_t) (1L << 24)) >> 25;
927 h5
-= carry5
* ((uint64_t) 1L << 25);
929 carry2
= (h2
+ (int64_t) (1L << 25)) >> 26;
931 h2
-= carry2
* ((uint64_t) 1L << 26);
933 carry6
= (h6
+ (int64_t) (1L << 25)) >> 26;
935 h6
-= carry6
* ((uint64_t) 1L << 26);
937 carry3
= (h3
+ (int64_t) (1L << 24)) >> 25;
939 h3
-= carry3
* ((uint64_t) 1L << 25);
941 carry7
= (h7
+ (int64_t) (1L << 24)) >> 25;
943 h7
-= carry7
* ((uint64_t) 1L << 25);
945 carry4
= (h4
+ (int64_t) (1L << 25)) >> 26;
947 h4
-= carry4
* ((uint64_t) 1L << 26);
949 carry8
= (h8
+ (int64_t) (1L << 25)) >> 26;
951 h8
-= carry8
* ((uint64_t) 1L << 26);
953 carry9
= (h9
+ (int64_t) (1L << 24)) >> 25;
955 h9
-= carry9
* ((uint64_t) 1L << 25);
957 carry0
= (h0
+ (int64_t) (1L << 25)) >> 26;
959 h0
-= carry0
* ((uint64_t) 1L << 26);
975 * Can overlap h with f.
 * |f| bounded by 1.65*2^26, 1.65*2^25, 1.65*2^26, 1.65*2^25, etc.
981 * |h| bounded by 1.01*2^25, 1.01*2^24, 1.01*2^25, 1.01*2^24, etc.
 * See fe_mul() for discussion of implementation strategy.
985 static void fe_sq2(fe h
, const fe f
)
998 int32_t f0_2
= 2 * f0
;
999 int32_t f1_2
= 2 * f1
;
1000 int32_t f2_2
= 2 * f2
;
1001 int32_t f3_2
= 2 * f3
;
1002 int32_t f4_2
= 2 * f4
;
1003 int32_t f5_2
= 2 * f5
;
1004 int32_t f6_2
= 2 * f6
;
1005 int32_t f7_2
= 2 * f7
;
1007 int32_t f5_38
= 38 * f5
; /* 1.959375*2^30 */
1008 int32_t f6_19
= 19 * f6
; /* 1.959375*2^30 */
1009 int32_t f7_38
= 38 * f7
; /* 1.959375*2^30 */
1010 int32_t f8_19
= 19 * f8
; /* 1.959375*2^30 */
1011 int32_t f9_38
= 38 * f9
; /* 1.959375*2^30 */
1013 int64_t f0f0
= f0
* (int64_t) f0
;
1014 int64_t f0f1_2
= f0_2
* (int64_t) f1
;
1015 int64_t f0f2_2
= f0_2
* (int64_t) f2
;
1016 int64_t f0f3_2
= f0_2
* (int64_t) f3
;
1017 int64_t f0f4_2
= f0_2
* (int64_t) f4
;
1018 int64_t f0f5_2
= f0_2
* (int64_t) f5
;
1019 int64_t f0f6_2
= f0_2
* (int64_t) f6
;
1020 int64_t f0f7_2
= f0_2
* (int64_t) f7
;
1021 int64_t f0f8_2
= f0_2
* (int64_t) f8
;
1022 int64_t f0f9_2
= f0_2
* (int64_t) f9
;
1024 int64_t f1f1_2
= f1_2
* (int64_t) f1
;
1025 int64_t f1f2_2
= f1_2
* (int64_t) f2
;
1026 int64_t f1f3_4
= f1_2
* (int64_t) f3_2
;
1027 int64_t f1f4_2
= f1_2
* (int64_t) f4
;
1028 int64_t f1f5_4
= f1_2
* (int64_t) f5_2
;
1029 int64_t f1f6_2
= f1_2
* (int64_t) f6
;
1030 int64_t f1f7_4
= f1_2
* (int64_t) f7_2
;
1031 int64_t f1f8_2
= f1_2
* (int64_t) f8
;
1032 int64_t f1f9_76
= f1_2
* (int64_t) f9_38
;
1034 int64_t f2f2
= f2
* (int64_t) f2
;
1035 int64_t f2f3_2
= f2_2
* (int64_t) f3
;
1036 int64_t f2f4_2
= f2_2
* (int64_t) f4
;
1037 int64_t f2f5_2
= f2_2
* (int64_t) f5
;
1038 int64_t f2f6_2
= f2_2
* (int64_t) f6
;
1039 int64_t f2f7_2
= f2_2
* (int64_t) f7
;
1040 int64_t f2f8_38
= f2_2
* (int64_t) f8_19
;
1041 int64_t f2f9_38
= f2
* (int64_t) f9_38
;
1043 int64_t f3f3_2
= f3_2
* (int64_t) f3
;
1044 int64_t f3f4_2
= f3_2
* (int64_t) f4
;
1045 int64_t f3f5_4
= f3_2
* (int64_t) f5_2
;
1046 int64_t f3f6_2
= f3_2
* (int64_t) f6
;
1047 int64_t f3f7_76
= f3_2
* (int64_t) f7_38
;
1048 int64_t f3f8_38
= f3_2
* (int64_t) f8_19
;
1049 int64_t f3f9_76
= f3_2
* (int64_t) f9_38
;
1051 int64_t f4f4
= f4
* (int64_t) f4
;
1052 int64_t f4f5_2
= f4_2
* (int64_t) f5
;
1053 int64_t f4f6_38
= f4_2
* (int64_t) f6_19
;
1054 int64_t f4f7_38
= f4
* (int64_t) f7_38
;
1055 int64_t f4f8_38
= f4_2
* (int64_t) f8_19
;
1056 int64_t f4f9_38
= f4
* (int64_t) f9_38
;
1058 int64_t f5f5_38
= f5
* (int64_t) f5_38
;
1059 int64_t f5f6_38
= f5_2
* (int64_t) f6_19
;
1060 int64_t f5f7_76
= f5_2
* (int64_t) f7_38
;
1061 int64_t f5f8_38
= f5_2
* (int64_t) f8_19
;
1062 int64_t f5f9_76
= f5_2
* (int64_t) f9_38
;
1064 int64_t f6f6_19
= f6
* (int64_t) f6_19
;
1065 int64_t f6f7_38
= f6
* (int64_t) f7_38
;
1066 int64_t f6f8_38
= f6_2
* (int64_t) f8_19
;
1067 int64_t f6f9_38
= f6
* (int64_t) f9_38
;
1069 int64_t f7f7_38
= f7
* (int64_t) f7_38
;
1070 int64_t f7f8_38
= f7_2
* (int64_t) f8_19
;
1071 int64_t f7f9_76
= f7_2
* (int64_t) f9_38
;
1073 int64_t f8f8_19
= f8
* (int64_t) f8_19
;
1074 int64_t f8f9_38
= f8
* (int64_t) f9_38
;
1076 int64_t f9f9_38
= f9
* (int64_t) f9_38
;
1078 int64_t h0
= f0f0
+ f1f9_76
+ f2f8_38
+ f3f7_76
+ f4f6_38
+ f5f5_38
;
1079 int64_t h1
= f0f1_2
+ f2f9_38
+ f3f8_38
+ f4f7_38
+ f5f6_38
;
1080 int64_t h2
= f0f2_2
+ f1f1_2
+ f3f9_76
+ f4f8_38
+ f5f7_76
+ f6f6_19
;
1081 int64_t h3
= f0f3_2
+ f1f2_2
+ f4f9_38
+ f5f8_38
+ f6f7_38
;
1082 int64_t h4
= f0f4_2
+ f1f3_4
+ f2f2
+ f5f9_76
+ f6f8_38
+ f7f7_38
;
1083 int64_t h5
= f0f5_2
+ f1f4_2
+ f2f3_2
+ f6f9_38
+ f7f8_38
;
1084 int64_t h6
= f0f6_2
+ f1f5_4
+ f2f4_2
+ f3f3_2
+ f7f9_76
+ f8f8_19
;
1085 int64_t h7
= f0f7_2
+ f1f6_2
+ f2f5_2
+ f3f4_2
+ f8f9_38
;
1086 int64_t h8
= f0f8_2
+ f1f7_4
+ f2f6_2
+ f3f5_4
+ f4f4
+ f9f9_38
;
1087 int64_t h9
= f0f9_2
+ f1f8_2
+ f2f7_2
+ f3f6_2
+ f4f5_2
;
1089 int64_t carry0
, carry1
, carry2
, carry3
, carry4
;
1090 int64_t carry5
, carry6
, carry7
, carry8
, carry9
;
1103 carry0
= (h0
+ (int64_t) (1L << 25)) >> 26;
1105 h0
-= carry0
* ((uint64_t) 1L << 26);
1107 carry4
= (h4
+ (int64_t) (1L << 25)) >> 26;
1109 h4
-= carry4
* ((uint64_t) 1L << 26);
1111 carry1
= (h1
+ (int64_t) (1L << 24)) >> 25;
1113 h1
-= carry1
* ((uint64_t) 1L << 25);
1115 carry5
= (h5
+ (int64_t) (1L << 24)) >> 25;
1117 h5
-= carry5
* ((uint64_t) 1L << 25);
1119 carry2
= (h2
+ (int64_t) (1L << 25)) >> 26;
1121 h2
-= carry2
* ((uint64_t) 1L << 26);
1123 carry6
= (h6
+ (int64_t) (1L << 25)) >> 26;
1125 h6
-= carry6
* ((uint64_t) 1L << 26);
1127 carry3
= (h3
+ (int64_t) (1L << 24)) >> 25;
1129 h3
-= carry3
* ((uint64_t) 1L << 25);
1131 carry7
= (h7
+ (int64_t) (1L << 24)) >> 25;
1133 h7
-= carry7
* ((uint64_t) 1L << 25);
1135 carry4
= (h4
+ (int64_t) (1L << 25)) >> 26;
1137 h4
-= carry4
* ((uint64_t) 1L << 26);
1139 carry8
= (h8
+ (int64_t) (1L << 25)) >> 26;
1141 h8
-= carry8
* ((uint64_t) 1L << 26);
1143 carry9
= (h9
+ (int64_t) (1L << 24)) >> 25;
1145 h9
-= carry9
* ((uint64_t) 1L << 25);
1147 carry0
= (h0
+ (int64_t) (1L << 25)) >> 26;
1149 h0
-= carry0
* ((uint64_t) 1L << 26);
1151 h
[0] = (int32_t) h0
;
1152 h
[1] = (int32_t) h1
;
1153 h
[2] = (int32_t) h2
;
1154 h
[3] = (int32_t) h3
;
1155 h
[4] = (int32_t) h4
;
1156 h
[5] = (int32_t) h5
;
1157 h
[6] = (int32_t) h6
;
1158 h
[7] = (int32_t) h7
;
1159 h
[8] = (int32_t) h8
;
1160 h
[9] = (int32_t) h9
;
1163 static void fe_invert(fe out
, const fe z
)
1177 for (i
= 1; i
< 5; ++i
)
1185 for (i
= 1; i
< 10; ++i
)
1193 for (i
= 1; i
< 20; ++i
)
1201 for (i
= 1; i
< 10; ++i
)
1209 for (i
= 1; i
< 50; ++i
)
1217 for (i
= 1; i
< 100; ++i
)
1225 for (i
= 1; i
< 50; ++i
)
1233 for (i
= 1; i
< 5; ++i
)
1238 fe_mul(out
, t1
, t0
);
1241 static void fe_pow22523(fe out
, const fe z
)
1255 for (i
= 1; i
< 5; ++i
)
1263 for (i
= 1; i
< 10; ++i
)
1271 for (i
= 1; i
< 20; ++i
)
1279 for (i
= 1; i
< 10; ++i
)
1287 for (i
= 1; i
< 50; ++i
)
1294 for (i
= 1; i
< 100; ++i
)
1302 for (i
= 1; i
< 50; ++i
)
1315 * Can overlap h with f or g.
1318 * |f| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
1319 * |g| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
1322 * |h| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25, etc.
1324 static void fe_sub(fe h
, const fe f
, const fe g
)
1348 int32_t h0
= f0
- g0
;
1349 int32_t h1
= f1
- g1
;
1350 int32_t h2
= f2
- g2
;
1351 int32_t h3
= f3
- g3
;
1352 int32_t h4
= f4
- g4
;
1353 int32_t h5
= f5
- g5
;
1354 int32_t h6
= f6
- g6
;
1355 int32_t h7
= f7
- g7
;
1356 int32_t h8
= f8
- g8
;
1357 int32_t h9
= f9
- g9
;
1374 static void ge_add(ge_p1p1
*r
, const ge_p3
*p
, const ge_cached
*q
)
1378 fe_add(r
->X
, p
->Y
, p
->X
);
1379 fe_sub(r
->Y
, p
->Y
, p
->X
);
1380 fe_mul(r
->Z
, r
->X
, q
->YplusX
);
1381 fe_mul(r
->Y
, r
->Y
, q
->YminusX
);
1382 fe_mul(r
->T
, q
->T2d
, p
->T
);
1383 fe_mul(r
->X
, p
->Z
, q
->Z
);
1384 fe_add(t0
, r
->X
, r
->X
);
1385 fe_sub(r
->X
, r
->Z
, r
->Y
);
1386 fe_add(r
->Y
, r
->Z
, r
->Y
);
1387 fe_add(r
->Z
, t0
, r
->T
);
1388 fe_sub(r
->T
, t0
, r
->T
);
1391 static void slide(int8_t *r
, const uint8_t *a
)
1395 for (i
= 0; i
< 256; ++i
)
1397 r
[i
] = 1 & (a
[i
>> 3] >> (i
& 7));
1400 for (i
= 0; i
< 256; ++i
)
1404 for (b
= 1; b
<= 6 && i
+ b
< 256; ++b
)
1408 if (r
[i
] + (r
[i
+ b
] << b
) <= 15)
1410 r
[i
] += r
[i
+ b
] << b
; r
[i
+ b
] = 0;
1412 else if (r
[i
] - (r
[i
+ b
] << b
) >= -15)
1414 r
[i
] -= r
[i
+ b
] << b
;
1416 for (k
= i
+ b
; k
< 256; ++k
)
1436 static const ge_precomp Bi
[8] = {
1440 /* 37095705934669439343138083508754565189542113879843219016388785533085940283555 */
1441 static const fe d
= {
1442 -10913610, 13857413, -15372611, 6949391, 114729,
1443 -8787816, -6275908, -3247719, -18696448, -12055116
1447 static const fe sqrtm1
= {
1448 -32595792, -7943725, 9377950, 3500415, 12389472,
1449 -272473, -25146209, -2005654, 326686, 11406482
1452 int ge_frombytes_negate_vartime(ge_p3
*h
, const uint8_t *s
)
1454 fe u
, v
, v3
, vxx
, check
;
1456 fe_frombytes(h
->Y
,s
);
1460 fe_sub(u
,u
,h
->Z
); /* u = y^2-1 */
1461 fe_add(v
,v
,h
->Z
); /* v = dy^2+1 */
1464 fe_mul(v3
,v3
,v
); /* v3 = v^3 */
1466 fe_mul(h
->X
,h
->X
,v
);
1467 fe_mul(h
->X
,h
->X
,u
); /* x = uv^7 */
1469 fe_pow22523(h
->X
,h
->X
); /* x = (uv^7)^((q-5)/8) */
1470 fe_mul(h
->X
,h
->X
,v3
);
1471 fe_mul(h
->X
,h
->X
,u
); /* x = uv^3(uv^7)^((q-5)/8) */
1475 fe_sub(check
,vxx
,u
); /* vx^2-u */
1477 if (fe_isnonzero(check
))
1479 fe_add(check
,vxx
,u
); /* vx^2+u */
1481 if (fe_isnonzero(check
))
1485 fe_mul(h
->X
,h
->X
,sqrtm1
);
1488 if (fe_isnegative(h
->X
) == (s
[31] >> 7))
1492 fe_mul(h
->T
,h
->X
,h
->Y
);
1500 static void ge_madd(ge_p1p1
*r
, const ge_p3
*p
, const ge_precomp
*q
)
1504 fe_add(r
->X
, p
->Y
, p
->X
);
1505 fe_sub(r
->Y
, p
->Y
, p
->X
);
1506 fe_mul(r
->Z
, r
->X
, q
->yplusx
);
1507 fe_mul(r
->Y
, r
->Y
, q
->yminusx
);
1508 fe_mul(r
->T
, q
->xy2d
, p
->T
);
1509 fe_add(t0
, p
->Z
, p
->Z
);
1510 fe_sub(r
->X
, r
->Z
, r
->Y
);
1511 fe_add(r
->Y
, r
->Z
, r
->Y
);
1512 fe_add(r
->Z
, t0
, r
->T
);
1513 fe_sub(r
->T
, t0
, r
->T
);
1519 static void ge_msub(ge_p1p1
*r
, const ge_p3
*p
, const ge_precomp
*q
)
1523 fe_add(r
->X
, p
->Y
, p
->X
);
1524 fe_sub(r
->Y
, p
->Y
, p
->X
);
1525 fe_mul(r
->Z
, r
->X
, q
->yminusx
);
1526 fe_mul(r
->Y
, r
->Y
, q
->yplusx
);
1527 fe_mul(r
->T
, q
->xy2d
, p
->T
);
1528 fe_add(t0
, p
->Z
, p
->Z
);
1529 fe_sub(r
->X
, r
->Z
, r
->Y
);
1530 fe_add(r
->Y
, r
->Z
, r
->Y
);
1531 fe_sub(r
->Z
, t0
, r
->T
);
1532 fe_add(r
->T
, t0
, r
->T
);
1538 static void ge_p1p1_to_p2(ge_p2
*r
, const ge_p1p1
*p
)
1540 fe_mul(r
->X
,p
->X
,p
->T
);
1541 fe_mul(r
->Y
,p
->Y
,p
->Z
);
1542 fe_mul(r
->Z
,p
->Z
,p
->T
);
1548 static void ge_p1p1_to_p3(ge_p3
*r
, const ge_p1p1
*p
)
1550 fe_mul(r
->X
,p
->X
,p
->T
);
1551 fe_mul(r
->Y
,p
->Y
,p
->Z
);
1552 fe_mul(r
->Z
,p
->Z
,p
->T
);
1553 fe_mul(r
->T
,p
->X
,p
->Y
);
1556 static void ge_p2_0(ge_p2
*h
)
1566 static void ge_p2_dbl(ge_p1p1
*r
, const ge_p2
*p
)
1573 fe_add(r
->Y
, p
->X
, p
->Y
);
1575 fe_add(r
->Y
, r
->Z
, r
->X
);
1576 fe_sub(r
->Z
, r
->Z
, r
->X
);
1577 fe_sub(r
->X
, t0
, r
->Y
);
1578 fe_sub(r
->T
, r
->T
, r
->Z
);
1581 static void ge_p3_0(ge_p3
*h
)
1593 /* 2 * d = 16295367250680780974490674513165176452449235426866156013048779062215315747161 */
1594 static const fe d2
= {
1595 -21827239, -5839606, -30745221, 13898782, 229458,
1596 15978800, -12551817, -6495438, 29715968, 9444199
1599 static void ge_p3_to_cached(ge_cached
*r
, const ge_p3
*p
)
1601 fe_add(r
->YplusX
,p
->Y
,p
->X
);
1602 fe_sub(r
->YminusX
,p
->Y
,p
->X
);
1604 fe_mul(r
->T2d
,p
->T
,d2
);
1610 static void ge_p3_to_p2(ge_p2
*r
, const ge_p3
*p
)
1617 void ge_p3_tobytes(uint8_t *s
, const ge_p3
*h
)
1621 fe_invert(recip
,h
->Z
);
1622 fe_mul(x
,h
->X
,recip
);
1623 fe_mul(y
,h
->Y
,recip
);
1626 s
[31] ^= fe_isnegative(x
) << 7;
1632 static void ge_p3_dbl(ge_p1p1
*r
, const ge_p3
*p
)
1639 static void ge_precomp_0(ge_precomp
*h
)
/**
 * Branch-free equality test of two signed bytes.
 *
 * The comparison is done with XOR, a wrapping decrement and a shift only,
 * so no conditional jump depends on the operands.
 *
 * @param b		first value
 * @param c		second value
 * @return		1 if b == c, 0 otherwise
 */
static uint8_t equal(int8_t b, int8_t c)
{
	/* reinterpret as unsigned bytes (conversion is modulo 256) */
	uint8_t ub = b;
	uint8_t uc = c;
	uint8_t x = ub ^ uc; /* 0: yes; 1..255: no */
	uint32_t y = x;      /* 0: yes; 1..255: no */

	/* unsigned wrap-around turns 0 into all ones, setting bit 31 */
	y -= 1;   /* 4294967295: yes; 0..254: no */
	y >>= 31; /* 1: yes; 0: no */

	return y;
}
/**
 * Branch-free sign test of a signed byte.
 *
 * Converting a negative int8_t to uint64_t is reduction modulo 2^64,
 * which sets the top bit; that bit is then shifted down to position 0.
 *
 * @param b		value to test
 * @return		1 if b < 0, 0 otherwise
 */
static uint8_t negative(int8_t b)
{
	/* 18446744073709551488..18446744073709551615: yes; 0..127: no */
	uint64_t x = b;

	x >>= 63; /* 1: yes; 0: no */

	return x;
}
1668 static void cmov(ge_precomp
*t
, const ge_precomp
*u
, uint8_t b
)
1670 fe_cmov(t
->yplusx
,u
->yplusx
,b
);
1671 fe_cmov(t
->yminusx
,u
->yminusx
,b
);
1672 fe_cmov(t
->xy2d
,u
->xy2d
,b
);
1676 * base[i][j] = (j+1)*256^i*B
1678 static const ge_precomp base
[32][8] = {
/* ge_select: loads into t the table entry for signed digit b at row pos.
 * Every one of the eight entries base[pos][0..7] is passed through cmov,
 * with the flag from equal() deciding which one lands in t, so the memory
 * access pattern does not depend on b. A negated copy (minust) is then
 * built and moved into t when b is negative.
 * NOTE(review): the declaration of minust and the initialisation of t
 * before the first cmov are not visible; if babs matches none of 1..8
 * (e.g. b == 0) t keeps whatever that initial value is. */
1682 static void ge_select(ge_precomp
*t
, int pos
, int8_t b
)
/* bnegative is 1 when b < 0, else 0 */
1685 uint8_t bnegative
= negative(b
);
/* babs = |b|: (-bnegative) & b is b when negative and 0 otherwise, so
 * twice that amount is subtracted only for negative b */
1686 uint8_t babs
= b
- (((-bnegative
) & b
) * ((int8_t) 1 << 1));
/* entry k is selected when babs == k + 1 */
1689 cmov(t
,&base
[pos
][0],equal(babs
,1));
1690 cmov(t
,&base
[pos
][1],equal(babs
,2));
1691 cmov(t
,&base
[pos
][2],equal(babs
,3));
1692 cmov(t
,&base
[pos
][3],equal(babs
,4));
1693 cmov(t
,&base
[pos
][4],equal(babs
,5));
1694 cmov(t
,&base
[pos
][5],equal(babs
,6));
1695 cmov(t
,&base
[pos
][6],equal(babs
,7));
1696 cmov(t
,&base
[pos
][7],equal(babs
,8));
/* minust: swap yplusx/yminusx and negate xy2d */
1697 fe_copy(minust
.yplusx
,t
->yminusx
);
1698 fe_copy(minust
.yminusx
,t
->yplusx
);
1699 fe_neg(minust
.xy2d
,t
->xy2d
);
/* replace t by minust only when b was negative */
1700 cmov(t
,&minust
,bnegative
);
/* ge_sub: combines the ge_p3 point p with the ge_cached point q and
 * writes the ge_p1p1 result to r.
 * NOTE(review): by its name, and its use for negative digits in
 * ge_double_scalarmult_vartime, this presumably computes r = p - q.
 * The declaration of the temporary t0 is not visible. */
1706 static void ge_sub(ge_p1p1
*r
, const ge_p3
*p
, const ge_cached
*q
)
/* r->X = p->Y + p->X */
1710 fe_add(r
->X
, p
->Y
, p
->X
);
/* r->Y = p->Y - p->X */
1711 fe_sub(r
->Y
, p
->Y
, p
->X
);
/* r->Z = (p->Y + p->X) * q->YminusX */
1712 fe_mul(r
->Z
, r
->X
, q
->YminusX
);
/* r->Y = (p->Y - p->X) * q->YplusX */
1713 fe_mul(r
->Y
, r
->Y
, q
->YplusX
);
/* r->T = q->T2d * p->T */
1714 fe_mul(r
->T
, q
->T2d
, p
->T
);
/* r->X = p->Z * q->Z */
1715 fe_mul(r
->X
, p
->Z
, q
->Z
);
/* t0 = 2 * (p->Z * q->Z) */
1716 fe_add(t0
, r
->X
, r
->X
);
/* r->X = r->Z - r->Y */
1717 fe_sub(r
->X
, r
->Z
, r
->Y
);
/* r->Y = r->Z + r->Y */
1718 fe_add(r
->Y
, r
->Z
, r
->Y
);
/* r->Z = t0 - r->T */
1719 fe_sub(r
->Z
, t0
, r
->T
);
/* r->T = t0 + r->T */
1720 fe_add(r
->T
, t0
, r
->T
);
/* ge_tobytes: encodes the ge_p2 point h into the byte buffer s.
 * The visible steps divide X and Y by Z (multiplying by recip) and fold
 * the sign of x into the top bit of s[31]; s must therefore hold at
 * least 32 bytes.
 * NOTE(review): the statement that writes y into s is not visible. */
1723 void ge_tobytes(uint8_t *s
, const ge_p2
*h
)
/* recip is derived from h->Z by fe_invert (presumably 1/Z) */
1727 fe_invert(recip
,h
->Z
);
/* x = h->X * recip */
1728 fe_mul(x
,h
->X
,recip
);
/* y = h->Y * recip */
1729 fe_mul(y
,h
->Y
,recip
);
/* XOR the fe_isnegative(x) flag into bit 7 of the last byte */
1732 s
[31] ^= fe_isnegative(x
) << 7;
1737 * where a = a[0]+256*a[1]+...+256^31 a[31]
1738 * B is the Ed25519 base point (x,4/5) with x positive.
1746 * where a = a[0]+256*a[1]+...+256^31 a[31].
1747 * and b = b[0]+256*b[1]+...+256^31 b[31].
1748 * B is the Ed25519 base point (x,4/5) with x positive.
/* ge_double_scalarmult_vartime: writes a ge_p2 result to r, computed from
 * the scalar bytes a, the point A and (per the digit table bslide and the
 * point table Bi used below) a second scalar.
 * Execution branches on the digit values in aslide and bslide, so the
 * running time depends on the scalars, in line with the "vartime" name.
 * NOTE(review): the rest of the parameter list, the declarations, the
 * computation of aslide/bslide, the definition of Bi and several
 * statements inside the loops are not visible. */
1750 void ge_double_scalarmult_vartime(ge_p2
*r
, const uint8_t *a
, const ge_p3
*A
,
1755 ge_cached Ai
[8]; /* A,3A,5A,7A,9A,11A,13A,15A */
/* Ai[0] is A itself in cached form */
1763 ge_p3_to_cached(&Ai
[0],A
);
/* A2 is t converted to ge_p3 (presumably t = 2A; that step is not visible) */
1765 ge_p1p1_to_p3(&A2
,&t
);
/* Each further entry is A2 plus the previous entry: Ai[k] = A2 + Ai[k-1] */
1767 ge_add(&t
,&A2
,&Ai
[0]);
1768 ge_p1p1_to_p3(&u
,&t
);
1769 ge_p3_to_cached(&Ai
[1],&u
);
1771 ge_add(&t
,&A2
,&Ai
[1]);
1772 ge_p1p1_to_p3(&u
,&t
);
1773 ge_p3_to_cached(&Ai
[2],&u
);
1775 ge_add(&t
,&A2
,&Ai
[2]);
1776 ge_p1p1_to_p3(&u
,&t
);
1777 ge_p3_to_cached(&Ai
[3],&u
);
1779 ge_add(&t
,&A2
,&Ai
[3]);
1780 ge_p1p1_to_p3(&u
,&t
);
1781 ge_p3_to_cached(&Ai
[4],&u
);
1783 ge_add(&t
,&A2
,&Ai
[4]);
1784 ge_p1p1_to_p3(&u
,&t
);
1785 ge_p3_to_cached(&Ai
[5],&u
);
1787 ge_add(&t
,&A2
,&Ai
[5]);
1788 ge_p1p1_to_p3(&u
,&t
);
1789 ge_p3_to_cached(&Ai
[6],&u
);
1791 ge_add(&t
,&A2
,&Ai
[6]);
1792 ge_p1p1_to_p3(&u
,&t
);
1793 ge_p3_to_cached(&Ai
[7],&u
);
/* Scan downwards from index 255 for a position where either digit is
 * non-zero */
1797 for (i
= 255; i
>= 0; --i
)
1799 if (aslide
[i
] || bslide
[i
])
/* Digit of a: index the table with digit/2; the preceding condition is
 * not visible (presumably aslide[i] > 0) */
1811 ge_p1p1_to_p3(&u
,&t
);
1812 ge_add(&t
,&u
,&Ai
[aslide
[i
]/2]);
/* Negative digit of a: use ge_sub with the entry for its magnitude */
1814 else if (aslide
[i
] < 0)
1816 ge_p1p1_to_p3(&u
,&t
);
1817 ge_sub(&t
,&u
,&Ai
[(-aslide
[i
])/2]);
/* Same handling for the second scalar, using the Bi table with
 * ge_madd / ge_msub */
1822 ge_p1p1_to_p3(&u
,&t
);
1823 ge_madd(&t
,&u
,&Bi
[bslide
[i
]/2]);
1825 else if (bslide
[i
] < 0)
1827 ge_p1p1_to_p3(&u
,&t
);
1828 ge_msub(&t
,&u
,&Bi
[(-bslide
[i
])/2]);
/* Convert the accumulator t into the caller's ge_p2 result */
1830 ge_p1p1_to_p2(r
,&t
);
/* ge_scalarmult_base: writes to h a point computed from the 32 scalar
 * bytes in a using the precomputed table reached through ge_select.
 * NOTE(review): the name and the table comment suggest h = a * B for the
 * Ed25519 base point B; confirm.
 * Outline of the visible code: split a into 64 four-bit digits, recode
 * them to signed digits, accumulate the odd-indexed digits, apply four
 * doublings, then process the even-indexed digits.
 * NOTE(review): the declarations, the initialisation of h, the carry
 * computation and the addition inside the second loop are not visible. */
1834 void ge_scalarmult_base(ge_p3
*h
, const uint8_t *a
)
/* low and high nibble of each scalar byte */
1843 for (i
= 0; i
< 32; ++i
)
1845 e
[2 * i
+ 0] = (a
[i
] >> 0) & 15;
1846 e
[2 * i
+ 1] = (a
[i
] >> 4) & 15;
1848 /* each e[i] is between 0 and 15 */
1849 /* e[63] is between 0 and 7 */
/* recode: subtract carry * 16 from each of the first 63 digits */
1851 for (i
= 0; i
< 63; ++i
) {
1855 e
[i
] -= carry
* ((int8_t) 1 << 4);
1858 /* each e[i] is between -8 and 8 */
/* odd-indexed digits: select from table row i/2 and add into h */
1861 for (i
= 1; i
< 64; i
+= 2)
1863 ge_select(&t
,i
/ 2,e
[i
]);
1864 ge_madd(&r
,h
,&t
); ge_p1p1_to_p3(h
,&r
);
/* four successive doublings of h */
1867 ge_p3_dbl(&r
,h
); ge_p1p1_to_p2(&s
,&r
);
1868 ge_p2_dbl(&r
,&s
); ge_p1p1_to_p2(&s
,&r
);
1869 ge_p2_dbl(&r
,&s
); ge_p1p1_to_p2(&s
,&r
);
1870 ge_p2_dbl(&r
,&s
); ge_p1p1_to_p3(h
,&r
);
/* even-indexed digits: select from table row i/2 */
1872 for (i
= 0; i
< 64; i
+= 2)
1874 ge_select(&t
,i
/ 2,e
[i
]);
1876 ge_p1p1_to_p3(h
,&r
);
1882 * a[0]+256*a[1]+...+256^31*a[31] = a
1883 * b[0]+256*b[1]+...+256^31*b[31] = b
1884 * c[0]+256*c[1]+...+256^31*c[31] = c
1887 * s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l
1888 * where l = 2^252 + 27742317777372353535851937790883648493.
/* sc_muladd: multiply-accumulate on little-endian scalars; reads 32
 * bytes each from a, b and c and writes the result bytes to s.
 * NOTE(review): per the text preceding this function the result is
 * (a*b + c) mod l with l = 2^252 + 27742317777372353535851937790883648493.
 *
 * Outline of the visible code:
 *  - unpack a, b and c into twelve limbs each (21 bits per limb,
 *    2097151 = 2^21 - 1);
 *  - form the product limbs s_k = c_k + sum of a_i*b_j with i + j = k;
 *  - propagate carries between limbs in units of 2^21;
 *  - fold the high limbs back into lower limbs with fixed multipliers
 *    (666643, 470296, 654183, 997805, 136657, 683901);
 *  - repack the limbs into the output bytes.
 * NOTE(review): many statements (e.g. the assignments to s0, s22 and s23,
 * the additions of each carry into the next limb and part of the output
 * stores) are not visible here. */
1890 void sc_muladd(uint8_t *s
, const uint8_t *a
, const uint8_t *b
, const uint8_t *c
)
/* limbs of a: a0..a10 are masked to 21 bits, a11 takes the remaining
 * top bits unmasked */
1892 int64_t a0
= 2097151 & load_3(a
);
1893 int64_t a1
= 2097151 & (load_4(a
+ 2) >> 5);
1894 int64_t a2
= 2097151 & (load_3(a
+ 5) >> 2);
1895 int64_t a3
= 2097151 & (load_4(a
+ 7) >> 7);
1896 int64_t a4
= 2097151 & (load_4(a
+ 10) >> 4);
1897 int64_t a5
= 2097151 & (load_3(a
+ 13) >> 1);
1898 int64_t a6
= 2097151 & (load_4(a
+ 15) >> 6);
1899 int64_t a7
= 2097151 & (load_3(a
+ 18) >> 3);
1900 int64_t a8
= 2097151 & load_3(a
+ 21);
1901 int64_t a9
= 2097151 & (load_4(a
+ 23) >> 5);
1902 int64_t a10
= 2097151 & (load_3(a
+ 26) >> 2);
1903 int64_t a11
= (load_4(a
+ 28) >> 7);
/* limbs of b, same layout */
1905 int64_t b0
= 2097151 & load_3(b
);
1906 int64_t b1
= 2097151 & (load_4(b
+ 2) >> 5);
1907 int64_t b2
= 2097151 & (load_3(b
+ 5) >> 2);
1908 int64_t b3
= 2097151 & (load_4(b
+ 7) >> 7);
1909 int64_t b4
= 2097151 & (load_4(b
+ 10) >> 4);
1910 int64_t b5
= 2097151 & (load_3(b
+ 13) >> 1);
1911 int64_t b6
= 2097151 & (load_4(b
+ 15) >> 6);
1912 int64_t b7
= 2097151 & (load_3(b
+ 18) >> 3);
1913 int64_t b8
= 2097151 & load_3(b
+ 21);
1914 int64_t b9
= 2097151 & (load_4(b
+ 23) >> 5);
1915 int64_t b10
= 2097151 & (load_3(b
+ 26) >> 2);
1916 int64_t b11
= (load_4(b
+ 28) >> 7);
/* limbs of c, same layout */
1918 int64_t c0
= 2097151 & load_3(c
);
1919 int64_t c1
= 2097151 & (load_4(c
+ 2) >> 5);
1920 int64_t c2
= 2097151 & (load_3(c
+ 5) >> 2);
1921 int64_t c3
= 2097151 & (load_4(c
+ 7) >> 7);
1922 int64_t c4
= 2097151 & (load_4(c
+ 10) >> 4);
1923 int64_t c5
= 2097151 & (load_3(c
+ 13) >> 1);
1924 int64_t c6
= 2097151 & (load_4(c
+ 15) >> 6);
1925 int64_t c7
= 2097151 & (load_3(c
+ 18) >> 3);
1926 int64_t c8
= 2097151 & load_3(c
+ 21);
1927 int64_t c9
= 2097151 & (load_4(c
+ 23) >> 5);
1928 int64_t c10
= 2097151 & (load_3(c
+ 26) >> 2);
1929 int64_t c11
= (load_4(c
+ 28) >> 7);
1931 int64_t s0
, s1
, s2
, s3
, s4
, s5
, s6
, s7
, s8
, s9
, s10
, s11
;
1932 int64_t s12
, s13
, s14
, s15
, s16
, s17
, s18
, s19
, s20
, s21
, s22
, s23
;
1934 int64_t carry0
, carry1
, carry2
, carry3
, carry4
, carry5
, carry6
;
1935 int64_t carry7
, carry8
, carry9
, carry10
, carry11
, carry12
, carry13
;
1936 int64_t carry14
, carry15
, carry16
, carry17
, carry18
, carry19
, carry20
;
1937 int64_t carry21
, carry22
;
/* product limbs: s_k collects every a_i*b_j with i + j = k; the limbs of
 * c are added only for k <= 11 */
1940 s1
= c1
+ a0
*b1
+ a1
*b0
;
1941 s2
= c2
+ a0
*b2
+ a1
*b1
+ a2
*b0
;
1942 s3
= c3
+ a0
*b3
+ a1
*b2
+ a2
*b1
+ a3
*b0
;
1943 s4
= c4
+ a0
*b4
+ a1
*b3
+ a2
*b2
+ a3
*b1
+ a4
*b0
;
1944 s5
= c5
+ a0
*b5
+ a1
*b4
+ a2
*b3
+ a3
*b2
+ a4
*b1
+ a5
*b0
;
1945 s6
= c6
+ a0
*b6
+ a1
*b5
+ a2
*b4
+ a3
*b3
+ a4
*b2
+ a5
*b1
+ a6
*b0
;
1946 s7
= c7
+ a0
*b7
+ a1
*b6
+ a2
*b5
+ a3
*b4
+ a4
*b3
+ a5
*b2
+ a6
*b1
+ a7
*b0
;
1947 s8
= c8
+ a0
*b8
+ a1
*b7
+ a2
*b6
+ a3
*b5
+ a4
*b4
+ a5
*b3
+ a6
*b2
+ a7
*b1
+ a8
*b0
;
1948 s9
= c9
+ a0
*b9
+ a1
*b8
+ a2
*b7
+ a3
*b6
+ a4
*b5
+ a5
*b4
+ a6
*b3
+ a7
*b2
+ a8
*b1
+ a9
*b0
;
1949 s10
= c10
+ a0
*b10
+ a1
*b9
+ a2
*b8
+ a3
*b7
+ a4
*b6
+ a5
*b5
+ a6
*b4
+ a7
*b3
+ a8
*b2
+ a9
*b1
+ a10
*b0
;
1950 s11
= c11
+ a0
*b11
+ a1
*b10
+ a2
*b9
+ a3
*b8
+ a4
*b7
+ a5
*b6
+ a6
*b5
+ a7
*b4
+ a8
*b3
+ a9
*b2
+ a10
*b1
+ a11
*b0
;
1951 s12
= a1
*b11
+ a2
*b10
+ a3
*b9
+ a4
*b8
+ a5
*b7
+ a6
*b6
+ a7
*b5
+ a8
*b4
+ a9
*b3
+ a10
*b2
+ a11
*b1
;
1952 s13
= a2
*b11
+ a3
*b10
+ a4
*b9
+ a5
*b8
+ a6
*b7
+ a7
*b6
+ a8
*b5
+ a9
*b4
+ a10
*b3
+ a11
*b2
;
1953 s14
= a3
*b11
+ a4
*b10
+ a5
*b9
+ a6
*b8
+ a7
*b7
+ a8
*b6
+ a9
*b5
+ a10
*b4
+ a11
*b3
;
1954 s15
= a4
*b11
+ a5
*b10
+ a6
*b9
+ a7
*b8
+ a8
*b7
+ a9
*b6
+ a10
*b5
+ a11
*b4
;
1955 s16
= a5
*b11
+ a6
*b10
+ a7
*b9
+ a8
*b8
+ a9
*b7
+ a10
*b6
+ a11
*b5
;
1956 s17
= a6
*b11
+ a7
*b10
+ a8
*b9
+ a9
*b8
+ a10
*b7
+ a11
*b6
;
1957 s18
= a7
*b11
+ a8
*b10
+ a9
*b9
+ a10
*b8
+ a11
*b7
;
1958 s19
= a8
*b11
+ a9
*b10
+ a10
*b9
+ a11
*b8
;
1959 s20
= a9
*b11
+ a10
*b10
+ a11
*b9
;
1960 s21
= a10
*b11
+ a11
*b10
;
/* rounded carries: carry = (s + 2^20) >> 21, then s -= carry * 2^21;
 * even-indexed limbs first, odd-indexed limbs afterwards. The multiply
 * by an unsigned constant is used in place of a left shift of a
 * possibly negative carry. */
1964 carry0
= (s0
+ (int64_t) (1L << 20)) >> 21;
1966 s0
-= carry0
* ((uint64_t) 1L << 21);
1968 carry2
= (s2
+ (int64_t) (1L << 20)) >> 21;
1970 s2
-= carry2
* ((uint64_t) 1L << 21);
1972 carry4
= (s4
+ (int64_t) (1L << 20)) >> 21;
1974 s4
-= carry4
* ((uint64_t) 1L << 21);
1976 carry6
= (s6
+ (int64_t) (1L << 20)) >> 21;
1978 s6
-= carry6
* ((uint64_t) 1L << 21);
1980 carry8
= (s8
+ (int64_t) (1L << 20)) >> 21;
1982 s8
-= carry8
* ((uint64_t) 1L << 21);
1984 carry10
= (s10
+ (int64_t) (1L << 20)) >> 21;
1986 s10
-= carry10
* ((uint64_t) 1L << 21);
1988 carry12
= (s12
+ (int64_t) (1L << 20)) >> 21;
1990 s12
-= carry12
* ((uint64_t) 1L << 21);
1992 carry14
= (s14
+ (int64_t) (1L << 20)) >> 21;
1994 s14
-= carry14
* ((uint64_t) 1L << 21);
1996 carry16
= (s16
+ (int64_t) (1L << 20)) >> 21;
1998 s16
-= carry16
* ((uint64_t) 1L << 21);
2000 carry18
= (s18
+ (int64_t) (1L << 20)) >> 21;
2002 s18
-= carry18
* ((uint64_t) 1L << 21);
2004 carry20
= (s20
+ (int64_t) (1L << 20)) >> 21;
2006 s20
-= carry20
* ((uint64_t) 1L << 21);
2008 carry22
= (s22
+ (int64_t) (1L << 20)) >> 21;
2010 s22
-= carry22
* ((uint64_t) 1L << 21);
2012 carry1
= (s1
+ (int64_t) (1L << 20)) >> 21;
2014 s1
-= carry1
* ((uint64_t) 1L << 21);
2016 carry3
= (s3
+ (int64_t) (1L << 20)) >> 21;
2018 s3
-= carry3
* ((uint64_t) 1L << 21);
2020 carry5
= (s5
+ (int64_t) (1L << 20)) >> 21;
2022 s5
-= carry5
* ((uint64_t) 1L << 21);
2024 carry7
= (s7
+ (int64_t) (1L << 20)) >> 21;
2026 s7
-= carry7
* ((uint64_t) 1L << 21);
2028 carry9
= (s9
+ (int64_t) (1L << 20)) >> 21;
2030 s9
-= carry9
* ((uint64_t) 1L << 21);
2032 carry11
= (s11
+ (int64_t) (1L << 20)) >> 21;
2034 s11
-= carry11
* ((uint64_t) 1L << 21);
2036 carry13
= (s13
+ (int64_t) (1L << 20)) >> 21;
2038 s13
-= carry13
* ((uint64_t) 1L << 21);
2040 carry15
= (s15
+ (int64_t) (1L << 20)) >> 21;
2042 s15
-= carry15
* ((uint64_t) 1L << 21);
2044 carry17
= (s17
+ (int64_t) (1L << 20)) >> 21;
2046 s17
-= carry17
* ((uint64_t) 1L << 21);
2048 carry19
= (s19
+ (int64_t) (1L << 20)) >> 21;
2050 s19
-= carry19
* ((uint64_t) 1L << 21);
2052 carry21
= (s21
+ (int64_t) (1L << 20)) >> 21;
2054 s21
-= carry21
* ((uint64_t) 1L << 21);
/* fold the high limbs s23 down to s18 into lower limbs, each scaled by
 * the fixed multipliers */
2056 s11
+= s23
* 666643;
2057 s12
+= s23
* 470296;
2058 s13
+= s23
* 654183;
2059 s14
-= s23
* 997805;
2060 s15
+= s23
* 136657;
2061 s16
-= s23
* 683901;
2063 s10
+= s22
* 666643;
2064 s11
+= s22
* 470296;
2065 s12
+= s22
* 654183;
2066 s13
-= s22
* 997805;
2067 s14
+= s22
* 136657;
2068 s15
-= s22
* 683901;
2071 s10
+= s21
* 470296;
2072 s11
+= s21
* 654183;
2073 s12
-= s21
* 997805;
2074 s13
+= s21
* 136657;
2075 s14
-= s21
* 683901;
2079 s10
+= s20
* 654183;
2080 s11
-= s20
* 997805;
2081 s12
+= s20
* 136657;
2082 s13
-= s20
* 683901;
2087 s10
-= s19
* 997805;
2088 s11
+= s19
* 136657;
2089 s12
-= s19
* 683901;
2095 s10
+= s18
* 136657;
2096 s11
-= s18
* 683901;
/* rounded carries over limbs 6..16 after the first fold */
2098 carry6
= (s6
+ (int64_t) (1L << 20)) >> 21;
2100 s6
-= carry6
* ((uint64_t) 1L << 21);
2102 carry8
= (s8
+ (int64_t) (1L << 20)) >> 21;
2104 s8
-= carry8
* ((uint64_t) 1L << 21);
2106 carry10
= (s10
+ (int64_t) (1L << 20)) >> 21;
2108 s10
-= carry10
* ((uint64_t) 1L << 21);
2110 carry12
= (s12
+ (int64_t) (1L << 20)) >> 21;
2112 s12
-= carry12
* ((uint64_t) 1L << 21);
2114 carry14
= (s14
+ (int64_t) (1L << 20)) >> 21;
2116 s14
-= carry14
* ((uint64_t) 1L << 21);
2118 carry16
= (s16
+ (int64_t) (1L << 20)) >> 21;
2120 s16
-= carry16
* ((uint64_t) 1L << 21);
2122 carry7
= (s7
+ (int64_t) (1L << 20)) >> 21;
2124 s7
-= carry7
* ((uint64_t) 1L << 21);
2126 carry9
= (s9
+ (int64_t) (1L << 20)) >> 21;
2128 s9
-= carry9
* ((uint64_t) 1L << 21);
2130 carry11
= (s11
+ (int64_t) (1L << 20)) >> 21;
2132 s11
-= carry11
* ((uint64_t) 1L << 21);
2134 carry13
= (s13
+ (int64_t) (1L << 20)) >> 21;
2136 s13
-= carry13
* ((uint64_t) 1L << 21);
2138 carry15
= (s15
+ (int64_t) (1L << 20)) >> 21;
2140 s15
-= carry15
* ((uint64_t) 1L << 21);
/* part of the second fold (remaining fold statements are not visible) */
2147 s10
-= s17
* 683901;
/* rounded carries over limbs 0..11 */
2185 carry0
= (s0
+ (int64_t) (1L << 20)) >> 21;
2187 s0
-= carry0
* ((uint64_t) 1L << 21);
2189 carry2
= (s2
+ (int64_t) (1L << 20)) >> 21;
2191 s2
-= carry2
* ((uint64_t) 1L << 21);
2193 carry4
= (s4
+ (int64_t) (1L << 20)) >> 21;
2195 s4
-= carry4
* ((uint64_t) 1L << 21);
2197 carry6
= (s6
+ (int64_t) (1L << 20)) >> 21;
2199 s6
-= carry6
* ((uint64_t) 1L << 21);
2201 carry8
= (s8
+ (int64_t) (1L << 20)) >> 21;
2203 s8
-= carry8
* ((uint64_t) 1L << 21);
2205 carry10
= (s10
+ (int64_t) (1L << 20)) >> 21;
2207 s10
-= carry10
* ((uint64_t) 1L << 21);
2209 carry1
= (s1
+ (int64_t) (1L << 20)) >> 21;
2211 s1
-= carry1
* ((uint64_t) 1L << 21);
2213 carry3
= (s3
+ (int64_t) (1L << 20)) >> 21;
2215 s3
-= carry3
* ((uint64_t) 1L << 21);
2217 carry5
= (s5
+ (int64_t) (1L << 20)) >> 21;
2219 s5
-= carry5
* ((uint64_t) 1L << 21);
2221 carry7
= (s7
+ (int64_t) (1L << 20)) >> 21;
2223 s7
-= carry7
* ((uint64_t) 1L << 21);
2225 carry9
= (s9
+ (int64_t) (1L << 20)) >> 21;
2227 s9
-= carry9
* ((uint64_t) 1L << 21);
2229 carry11
= (s11
+ (int64_t) (1L << 20)) >> 21;
2231 s11
-= carry11
* ((uint64_t) 1L << 21);
/* first sequential carry pass; the visible carry10/carry11 use a plain
 * >> 21 without the 2^20 rounding term */
2243 s0
-= carry0
* ((uint64_t) 1L << 21);
2247 s1
-= carry1
* ((uint64_t) 1L << 21);
2251 s2
-= carry2
* ((uint64_t) 1L << 21);
2255 s3
-= carry3
* ((uint64_t) 1L << 21);
2259 s4
-= carry4
* ((uint64_t) 1L << 21);
2263 s5
-= carry5
* ((uint64_t) 1L << 21);
2267 s6
-= carry6
* ((uint64_t) 1L << 21);
2271 s7
-= carry7
* ((uint64_t) 1L << 21);
2275 s8
-= carry8
* ((uint64_t) 1L << 21);
2279 s9
-= carry9
* ((uint64_t) 1L << 21);
2281 carry10
= s10
>> 21;
2283 s10
-= carry10
* ((uint64_t) 1L << 21);
2285 carry11
= s11
>> 21;
2287 s11
-= carry11
* ((uint64_t) 1L << 21);
/* second sequential carry pass */
2298 s0
-= carry0
* ((uint64_t) 1L << 21);
2302 s1
-= carry1
* ((uint64_t) 1L << 21);
2306 s2
-= carry2
* ((uint64_t) 1L << 21);
2310 s3
-= carry3
* ((uint64_t) 1L << 21);
2314 s4
-= carry4
* ((uint64_t) 1L << 21);
2318 s5
-= carry5
* ((uint64_t) 1L << 21);
2322 s6
-= carry6
* ((uint64_t) 1L << 21);
2326 s7
-= carry7
* ((uint64_t) 1L << 21);
2330 s8
-= carry8
* ((uint64_t) 1L << 21);
2334 s9
-= carry9
* ((uint64_t) 1L << 21);
2336 carry10
= s10
>> 21;
2338 s10
-= carry10
* ((uint64_t) 1L << 21);
/* repack: output bytes that straddle two limbs take the top bits of one
 * limb and the low bits of the next */
2342 s
[2] = (s0
>> 16) | (s1
* ((uint64_t) 1 << 5));
2345 s
[5] = (s1
>> 19) | (s2
* ((uint64_t) 1 << 2));
2347 s
[7] = (s2
>> 14) | (s3
* ((uint64_t) 1 << 7));
2350 s
[10] = (s3
>> 17) | (s4
* ((uint64_t) 1 << 4));
2353 s
[13] = (s4
>> 20) | (s5
* ((uint64_t) 1 << 1));
2355 s
[15] = (s5
>> 15) | (s6
* ((uint64_t) 1 << 6));
2358 s
[18] = (s6
>> 18) | (s7
* ((uint64_t) 1 << 3));
2363 s
[23] = (s8
>> 16) | (s9
* ((uint64_t) 1 << 5));
2366 s
[26] = (s9
>> 19) | (s10
* ((uint64_t) 1 << 2));
2368 s
[28] = (s10
>> 14) | (s11
* ((uint64_t) 1 << 7));
2376 * s[0]+256*s[1]+...+256^63*s[63] = s
2379 * s[0]+256*s[1]+...+256^31*s[31] = s mod l
2380 * where l = 2^252 + 27742317777372353535851937790883648493.
2381 * Overwrites s in place.
2383 void sc_reduce(uint8_t *s
)
2385 int64_t s0
= 2097151 & load_3(s
);
2386 int64_t s1
= 2097151 & (load_4(s
+ 2) >> 5);
2387 int64_t s2
= 2097151 & (load_3(s
+ 5) >> 2);
2388 int64_t s3
= 2097151 & (load_4(s
+ 7) >> 7);
2389 int64_t s4
= 2097151 & (load_4(s
+ 10) >> 4);
2390 int64_t s5
= 2097151 & (load_3(s
+ 13) >> 1);
2391 int64_t s6
= 2097151 & (load_4(s
+ 15) >> 6);
2392 int64_t s7
= 2097151 & (load_3(s
+ 18) >> 3);
2393 int64_t s8
= 2097151 & load_3(s
+ 21);
2394 int64_t s9
= 2097151 & (load_4(s
+ 23) >> 5);
2395 int64_t s10
= 2097151 & (load_3(s
+ 26) >> 2);
2396 int64_t s11
= 2097151 & (load_4(s
+ 28) >> 7);
2397 int64_t s12
= 2097151 & (load_4(s
+ 31) >> 4);
2398 int64_t s13
= 2097151 & (load_3(s
+ 34) >> 1);
2399 int64_t s14
= 2097151 & (load_4(s
+ 36) >> 6);
2400 int64_t s15
= 2097151 & (load_3(s
+ 39) >> 3);
2401 int64_t s16
= 2097151 & load_3(s
+ 42);
2402 int64_t s17
= 2097151 & (load_4(s
+ 44) >> 5);
2403 int64_t s18
= 2097151 & (load_3(s
+ 47) >> 2);
2404 int64_t s19
= 2097151 & (load_4(s
+ 49) >> 7);
2405 int64_t s20
= 2097151 & (load_4(s
+ 52) >> 4);
2406 int64_t s21
= 2097151 & (load_3(s
+ 55) >> 1);
2407 int64_t s22
= 2097151 & (load_4(s
+ 57) >> 6);
2408 int64_t s23
= (load_4(s
+ 60) >> 3);
2410 int64_t carry0
, carry1
, carry2
, carry3
, carry4
, carry5
, carry6
;
2411 int64_t carry7
, carry8
, carry9
, carry10
, carry11
, carry12
, carry13
;
2412 int64_t carry14
, carry15
, carry16
;
2414 s11
+= s23
* 666643;
2415 s12
+= s23
* 470296;
2416 s13
+= s23
* 654183;
2417 s14
-= s23
* 997805;
2418 s15
+= s23
* 136657;
2419 s16
-= s23
* 683901;
2421 s10
+= s22
* 666643;
2422 s11
+= s22
* 470296;
2423 s12
+= s22
* 654183;
2424 s13
-= s22
* 997805;
2425 s14
+= s22
* 136657;
2426 s15
-= s22
* 683901;
2429 s10
+= s21
* 470296;
2430 s11
+= s21
* 654183;
2431 s12
-= s21
* 997805;
2432 s13
+= s21
* 136657;
2433 s14
-= s21
* 683901;
2437 s10
+= s20
* 654183;
2438 s11
-= s20
* 997805;
2439 s12
+= s20
* 136657;
2440 s13
-= s20
* 683901;
2445 s10
-= s19
* 997805;
2446 s11
+= s19
* 136657;
2447 s12
-= s19
* 683901;
2453 s10
+= s18
* 136657;
2454 s11
-= s18
* 683901;
2456 carry6
= (s6
+ (int64_t) (1L << 20)) >> 21;
2458 s6
-= carry6
* ((uint64_t) 1L << 21);
2460 carry8
= (s8
+ (int64_t) (1L << 20)) >> 21;
2462 s8
-= carry8
* ((uint64_t) 1L << 21);
2464 carry10
= (s10
+ (int64_t) (1L << 20)) >> 21;
2466 s10
-= carry10
* ((uint64_t) 1L << 21);
2468 carry12
= (s12
+ (int64_t) (1L << 20)) >> 21;
2470 s12
-= carry12
* ((uint64_t) 1L << 21);
2472 carry14
= (s14
+ (int64_t) (1L << 20)) >> 21;
2474 s14
-= carry14
* ((uint64_t) 1L << 21);
2476 carry16
= (s16
+ (int64_t) (1L << 20)) >> 21;
2478 s16
-= carry16
* ((uint64_t) 1L << 21);
2480 carry7
= (s7
+ (int64_t) (1L << 20)) >> 21;
2482 s7
-= carry7
* ((uint64_t) 1L << 21);
2484 carry9
= (s9
+ (int64_t) (1L << 20)) >> 21;
2486 s9
-= carry9
* ((uint64_t) 1L << 21);
2488 carry11
= (s11
+ (int64_t) (1L << 20)) >> 21;
2490 s11
-= carry11
* ((uint64_t) 1L << 21);
2492 carry13
= (s13
+ (int64_t) (1L << 20)) >> 21;
2494 s13
-= carry13
* ((uint64_t) 1L << 21);
2496 carry15
= (s15
+ (int64_t) (1L << 20)) >> 21;
2498 s15
-= carry15
* ((uint64_t) 1L << 21);
2505 s10
-= s17
* 683901;
2543 carry0
= (s0
+ (int64_t) (1L << 20)) >> 21;
2545 s0
-= carry0
* ((uint64_t) 1L << 21);
2547 carry2
= (s2
+ (int64_t) (1L << 20)) >> 21;
2549 s2
-= carry2
* ((uint64_t) 1L << 21);
2551 carry4
= (s4
+ (int64_t) (1L << 20)) >> 21;
2553 s4
-= carry4
* ((uint64_t) 1L << 21);
2555 carry6
= (s6
+ (int64_t) (1L << 20)) >> 21;
2557 s6
-= carry6
* ((uint64_t) 1L << 21);
2559 carry8
= (s8
+ (int64_t) (1L << 20)) >> 21;
2561 s8
-= carry8
* ((uint64_t) 1L << 21);
2563 carry10
= (s10
+ (int64_t) (1L << 20)) >> 21;
2565 s10
-= carry10
* ((uint64_t) 1L << 21);
2567 carry1
= (s1
+ (int64_t) (1L << 20)) >> 21;
2569 s1
-= carry1
* ((uint64_t) 1L << 21);
2571 carry3
= (s3
+ (int64_t) (1L << 20)) >> 21;
2573 s3
-= carry3
* ((uint64_t) 1L << 21);
2575 carry5
= (s5
+ (int64_t) (1L << 20)) >> 21;
2577 s5
-= carry5
* ((uint64_t) 1L << 21);
2579 carry7
= (s7
+ (int64_t) (1L << 20)) >> 21;
2581 s7
-= carry7
* ((uint64_t) 1L << 21);
2583 carry9
= (s9
+ (int64_t) (1L << 20)) >> 21;
2585 s9
-= carry9
* ((uint64_t) 1L << 21);
2587 carry11
= (s11
+ (int64_t) (1L << 20)) >> 21;
2589 s11
-= carry11
* ((uint64_t) 1L << 21);
2601 s0
-= carry0
* ((uint64_t) 1L << 21);
2605 s1
-= carry1
* ((uint64_t) 1L << 21);
2609 s2
-= carry2
* ((uint64_t) 1L << 21);
2613 s3
-= carry3
* ((uint64_t) 1L << 21);
2617 s4
-= carry4
* ((uint64_t) 1L << 21);
2621 s5
-= carry5
* ((uint64_t) 1L << 21);
2625 s6
-= carry6
* ((uint64_t) 1L << 21);
2629 s7
-= carry7
* ((uint64_t) 1L << 21);
2633 s8
-= carry8
* ((uint64_t) 1L << 21);
2637 s9
-= carry9
* ((uint64_t) 1L << 21);
2639 carry10
= s10
>> 21;
2641 s10
-= carry10
* ((uint64_t) 1L << 21);
2643 carry11
= s11
>> 21;
2645 s11
-= carry11
* ((uint64_t) 1L << 21);
2656 s0
-= carry0
* ((uint64_t) 1L << 21);
2660 s1
-= carry1
* ((uint64_t) 1L << 21);
2664 s2
-= carry2
* ((uint64_t) 1L << 21);
2668 s3
-= carry3
* ((uint64_t) 1L << 21);
2672 s4
-= carry4
* ((uint64_t) 1L << 21);
2676 s5
-= carry5
* ((uint64_t) 1L << 21);
2680 s6
-= carry6
* ((uint64_t) 1L << 21);
2684 s7
-= carry7
* ((uint64_t) 1L << 21);
2688 s8
-= carry8
* ((uint64_t) 1L << 21);
2692 s9
-= carry9
* ((uint64_t) 1L << 21);
2694 carry10
= s10
>> 21;
2696 s10
-= carry10
* ((uint64_t) 1L << 21);
2700 s
[2] = (s0
>> 16) | (s1
* ((uint64_t) 1 << 5));
2703 s
[5] = (s1
>> 19) | (s2
* ((uint64_t) 1 << 2));
2705 s
[7] = (s2
>> 14) | (s3
* ((uint64_t) 1 << 7));
2708 s
[10] = (s3
>> 17) | (s4
* ((uint64_t) 1 << 4));
2711 s
[13] = (s4
>> 20) | (s5
* ((uint64_t) 1 << 1));
2713 s
[15] = (s5
>> 15) | (s6
* ((uint64_t) 1 << 6));
2716 s
[18] = (s6
>> 18) | (s7
* ((uint64_t) 1 << 3));
2721 s
[23] = (s8
>> 16) | (s9
* ((uint64_t) 1 << 5));
2724 s
[26] = (s9
>> 19) | (s10
* ((uint64_t) 1 << 2));
2726 s
[28] = (s10
>> 14) | (s11
* ((uint64_t) 1 << 7));