unsigned i;
mp_limb_t cy;
- /* FIXME: Could save some more scratch space, e.g., by letting BB
- overlap C, D, and CB overlap A, D. And possibly reusing some of
- x2, z2, x3, z3. */
#define x2 (scratch)
#define z2 (scratch + m->size)
#define x3 (scratch + 2*m->size)
#define z3 (scratch + 3*m->size)
-#define A (scratch + 4*m->size)
-#define B (scratch + 5*m->size)
-#define C (scratch + 6*m->size)
-#define D (scratch + 7*m->size)
-#define AA (scratch + 8*m->size)
-#define BB (scratch + 9*m->size)
-#define E (scratch + 9*m->size) /* Overlap BB */
-#define DA (scratch + 8*m->size) /* Overlap AA */
-#define CB (scratch + 9*m->size) /* Overlap BB */
+ /* Formulas from RFC 7748:
+
+ A = x_2 + z_2
+ AA = A^2
+ B = x_2 - z_2
+ BB = B^2
+ E = AA - BB
+ C = x_3 + z_3
+ D = x_3 - z_3
+ DA = D * A
+ CB = C * B
+ x_3 = (DA + CB)^2
+ z_3 = x_1 * (DA - CB)^2
+ x_2 = AA * BB
+ z_2 = E * (AA + a24 * E)
+
+ For pure doubling, we use:
+
+ A = x_2 + z_2
+ AA = A^2
+ B = x_2 - z_2
+ BB = B^2
+ E = AA - BB
+ x_3 = AA * BB
+ z_3 = E * (AA + a24 * E)
+ */
+
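+ /* Here a24 = (a - 2)/4 for the Montgomery curve coefficient a; RFC
+ 7748 gives a24 = 121665 for X25519 and a24 = 39081 for X448. */
+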
+#define A (scratch + 4*m->size)
+#define AA A
+#define D (scratch + 5*m->size)
+#define DA D
+
+#define tp (scratch + 6*m->size)
+
+ /* For the doubling formulas. */
+#define B D
+#define BB D
+#define E D
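+
+ /* Scratch layout: x2, z2, x3, z3 occupy the first 4*size limbs,
+ A/AA and D/DA (with B, BB and E stacked on D for doubling) take
+ one slot each, and tp is the 2*size-limb scratch passed to
+ ecc_mod_mul and ecc_mod_sqr, so the ladder itself runs in 8*size
+ limbs. */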
/* Initialize, x2 = px, z2 = 1 */
mpn_copyi (x2, px, m->size);
mpn_zero (z2, m->size);
z2[0] = 1;
/* Get x3, z3 from doubling, since the most significant bit of the scalar is forced to 1. */
ecc_mod_add (m, A, x2, z2);
ecc_mod_sub (m, B, x2, z2);
- ecc_mod_sqr (m, AA, A, AA);
- ecc_mod_sqr (m, BB, B, BB);
- ecc_mod_mul (m, x3, AA, BB, x3);
+ ecc_mod_sqr (m, AA, A, tp);
+ ecc_mod_sqr (m, BB, B, tp);
+ ecc_mod_mul (m, x3, AA, BB, tp);
ecc_mod_sub (m, E, AA, BB);
ecc_mod_addmul_1 (m, AA, E, a24);
- ecc_mod_mul (m, z3, E, AA, z3);
+ ecc_mod_mul (m, z3, E, AA, tp);
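+
+ /* Ladder invariant: at the top of each iteration, (x2:z2) and
+ (x3:z3) hold the X coordinates of [k]P and [k+1]P, where k is the
+ part of the scalar processed so far. */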
for (i = bit_high; i >= bit_low; i--)
{
int bit = (n[i/8] >> (i & 7)) & 1;

mpn_cnd_swap (bit, x2, x3, 2*m->size);
- /* Formulas from RFC 7748. We compute new coordinates in
- memory-address order, since mul and sqr clobbers higher
- limbs. */
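+ /* The new coordinates are computed in an order chosen so that each
+ aliased value (A/AA, D/DA, and the values stacked at z2 and z3) is
+ dead before its slot is overwritten; tp is the only extra scratch. */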
ecc_mod_add (m, A, x2, z2);
- ecc_mod_sub (m, B, x2, z2);
- ecc_mod_sqr (m, AA, A, AA);
- ecc_mod_sqr (m, BB, B, BB);
- ecc_mod_mul (m, x2, AA, BB, x2); /* Last use of BB */
- ecc_mod_sub (m, E, AA, BB);
- ecc_mod_addmul_1 (m, AA, E, a24);
- ecc_mod_add (m, C, x3, z3);
ecc_mod_sub (m, D, x3, z3);
- ecc_mod_mul (m, z2, E, AA, z2); /* Last use of E and AA */
- ecc_mod_mul (m, DA, D, A, DA); /* Last use of D, A. FIXME: could
- let CB overlap. */
- ecc_mod_mul (m, CB, C, B, CB);
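+ /* DA must be formed before AA, since AA overwrites A. */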
+ ecc_mod_mul (m, DA, D, A, tp);
+ ecc_mod_sqr (m, AA, A, tp);
+
+ /* Store B, BB and E at z2 */
+ ecc_mod_sub (m, z2, x2, z2); /* B */
+ /* Store C and CB at z3 */
+ ecc_mod_add (m, z3, x3, z3); /* C */
+ ecc_mod_mul (m, z3, z3, z2, tp); /* CB */
+ ecc_mod_sqr (m, z2, z2, tp); /* BB */
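+ /* CB had to be formed first: squaring z2 into BB destroys B. */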
+
+ /* Finish x2 */
+ ecc_mod_mul (m, x2, AA, z2, tp);
+
+ ecc_mod_sub (m, z2, AA, z2); /* E */
+
+ /* Finish z2 */
+ ecc_mod_addmul_1 (m, AA, z2, a24);
+ ecc_mod_mul (m, z2, z2, AA, tp);
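+ /* z2 = E * (AA + a24 * E); AA and E are dead now. */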
+
+ /* Finish x3 */
+ ecc_mod_add (m, x3, DA, z3);
+ ecc_mod_sqr (m, x3, x3, tp);
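+ /* x3 = (DA + CB)^2. */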
- ecc_mod_add (m, C, DA, CB);
- ecc_mod_sqr (m, x3, C, x3);
- ecc_mod_sub (m, C, DA, CB);
- ecc_mod_sqr (m, DA, C, DA);
- ecc_mod_mul (m, z3, DA, px, z3);
+ /* Finish z3 */
+ ecc_mod_sub (m, z3, DA, z3); /* DA - CB */
+ ecc_mod_sqr (m, z3, z3, tp);
+ ecc_mod_mul (m, z3, z3, px, tp);
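+ /* z3 = x_1 * (DA - CB)^2, with x_1 = px. */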
/* FIXME: Could be combined with the loop's initial mpn_cnd_swap. */
mpn_cnd_swap (bit, x2, x3, 2*m->size);
}
/* Do the low zero bits, just duplicating x2 */
for (i = 0; i < bit_low; i++)
{
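+ /* B, BB and E all share D's slot here; each value is dead before
+ the next one is written. */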
ecc_mod_add (m, A, x2, z2);
ecc_mod_sub (m, B, x2, z2);
- ecc_mod_sqr (m, AA, A, AA);
- ecc_mod_sqr (m, BB, B, BB);
- ecc_mod_mul (m, x2, AA, BB, x2);
+ ecc_mod_sqr (m, AA, A, tp);
+ ecc_mod_sqr (m, BB, B, tp);
+ ecc_mod_mul (m, x2, AA, BB, tp);
ecc_mod_sub (m, E, AA, BB);
ecc_mod_addmul_1 (m, AA, E, a24);
- ecc_mod_mul (m, z2, E, AA, z2);
+ ecc_mod_mul (m, z2, E, AA, tp);
}
assert (m->invert_itch <= 7 * m->size);
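+ /* Invert z2 into x3, which is free once the ladder is done; the
+ inversion scratch starts right after z3. */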
m->invert (m, x3, z2, z3 + m->size);