From: Niels Möller
Date: Sun, 8 Nov 2020 10:17:20 +0000 (+0100)
Subject: Reduce scratch need for ecc_add_jjj
X-Git-Tag: nettle_3.7rc1~52^2~5
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=110efbf4b740746cb9067dab194d3e652e3e92a2;p=thirdparty%2Fnettle.git

Reduce scratch need for ecc_add_jjj
---

diff --git a/ChangeLog b/ChangeLog
index a4333788..8846f1d1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2020-11-08  Niels Möller
+
+        * ecc-add-jjj.c (ecc_add_jjj): Reduce scratch need.
+        * ecc-internal.h (ECC_ADD_JJJ_ITCH): Now 6*size.
+
 2020-11-06  Niels Möller
 
         After these changes, both curve25519 and curve448 need 4*size for
diff --git a/ecc-add-jjj.c b/ecc-add-jjj.c
index 5c416b81..a5a7e7a0 100644
--- a/ecc-add-jjj.c
+++ b/ecc-add-jjj.c
@@ -43,6 +43,17 @@ ecc_add_jjj (const struct ecc_curve *ecc,
              mp_limb_t *r, const mp_limb_t *p, const mp_limb_t *q,
              mp_limb_t *scratch)
 {
+#define x1 p
+#define y1 (p + ecc->p.size)
+#define z1 (p + 2*ecc->p.size)
+
+#define x2 q
+#define y2 (q + ecc->p.size)
+#define z2 (q + 2*ecc->p.size)
+
+#define x3 r
+#define y3 (r + ecc->p.size)
+#define z3 (r + 2*ecc->p.size)
   /* Formulas, from djb,
      http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl:
 
@@ -63,58 +74,62 @@ ecc_add_jjj (const struct ecc_curve *ecc,
       X3 = W^2-J-2*V            sqr             S1, W, J, V
       Y3 = W*(V-X3)-2*S1*J      mul, mul
   */
-  mp_limb_t *z1z1 = scratch;
-  mp_limb_t *z2z2 = scratch + ecc->p.size;
-  mp_limb_t *u1 = scratch + 2*ecc->p.size;
-  mp_limb_t *u2 = scratch + 3*ecc->p.size;
-  mp_limb_t *s1 = scratch; /* overlap z1z1 */
-  mp_limb_t *s2 = scratch + ecc->p.size; /* overlap z2z2 */
-  mp_limb_t *i = scratch + 4*ecc->p.size;
-  mp_limb_t *j = scratch + 5*ecc->p.size;
-  mp_limb_t *v = scratch + 6*ecc->p.size;
-
-  /* z1^2, z2^2, u1 = x1 x2^2, u2 = x2 z1^2 - u1 */
-  ecc_mod_sqr (&ecc->p, z1z1, p + 2*ecc->p.size, z1z1);
-  ecc_mod_sqr (&ecc->p, z2z2, q + 2*ecc->p.size, z2z2);
-  ecc_mod_mul (&ecc->p, u1, p, z2z2, u1);
-  ecc_mod_mul (&ecc->p, u2, q, z1z1, u2);
-  ecc_mod_sub (&ecc->p, u2, u2, u1); /* Store h in u2 */
-
-  /* z3, use i, j, v as scratch, result at i. */
-  ecc_mod_add (&ecc->p, i, p + 2*ecc->p.size, q + 2*ecc->p.size);
-  ecc_mod_sqr (&ecc->p, v, i, v);
-  ecc_mod_sub (&ecc->p, v, v, z1z1);
-  ecc_mod_sub (&ecc->p, v, v, z2z2);
-  ecc_mod_mul (&ecc->p, i, v, u2, i);
-  /* Delayed write, to support in-place operation. */
-
-  /* s1 = y1 z2^3, s2 = y2 z1^3, scratch at j and v */
-  ecc_mod_mul (&ecc->p, j, z1z1, p + 2*ecc->p.size, j); /* z1^3 */
-  ecc_mod_mul (&ecc->p, v, z2z2, q + 2*ecc->p.size, v); /* z2^3 */
-  ecc_mod_mul (&ecc->p, s1, p + ecc->p.size, v, s1);
-  ecc_mod_mul (&ecc->p, v, j, q + ecc->p.size, v);
-  ecc_mod_sub (&ecc->p, s2, v, s1);
-  ecc_mod_mul_1 (&ecc->p, s2, s2, 2);
-
-  /* Store z3 */
-  mpn_copyi (r + 2*ecc->p.size, i, ecc->p.size);
-
-  /* i, j, v */
-  ecc_mod_sqr (&ecc->p, i, u2, i);
-  ecc_mod_mul_1 (&ecc->p, i, i, 4);
-  ecc_mod_mul (&ecc->p, j, u2, i, j);
-  ecc_mod_mul (&ecc->p, v, u1, i, v);
-
-  /* now, u1, u2 and i are free for reuse .*/
-  /* x3, use u1, u2 as scratch */
-  ecc_mod_sqr (&ecc->p, u1, s2, u1);
-  ecc_mod_sub (&ecc->p, r, u1, j);
-  ecc_mod_submul_1 (&ecc->p, r, v, 2);
-
-  /* y3 */
-  ecc_mod_mul (&ecc->p, u1, s1, j, u1); /* Frees j */
-  ecc_mod_sub (&ecc->p, u2, v, r); /* Frees v */
-  ecc_mod_mul (&ecc->p, i, s2, u2, i);
-  ecc_mod_submul_1 (&ecc->p, i, u1, 2);
-  mpn_copyi (r + ecc->p.size, i, ecc->p.size);
+
+#define h scratch
+#define z1z1 (scratch + ecc->p.size)
+#define z2z2 (scratch + 2*ecc->p.size)
+#define z1z2 (scratch + 3*ecc->p.size)
+
+#define w (scratch + ecc->p.size)
+#define i (scratch + 2*ecc->p.size)
+#define j h
+#define v i
+
+#define tp (scratch + 4*ecc->p.size)
+
+  ecc_mod_sqr (&ecc->p, z1z1, z1, tp); /* z1z1 */
+  ecc_mod_sqr (&ecc->p, z2z2, z2, tp); /* z1z1, z2z2 */
+  /* Store u1 at x3 */
+  ecc_mod_mul (&ecc->p, x3, x1, z2z2, tp); /* z1z1, z2z2 */
+  ecc_mod_mul (&ecc->p, h, x2, z1z1, tp); /* z1z1, z2z2, h */
+  ecc_mod_sub (&ecc->p, h, h, x3);
+
+  ecc_mod_add (&ecc->p, z1z2, z1, z2); /* z1z1, z2z2, z1z2, h */
+  ecc_mod_sqr (&ecc->p, z1z2, z1z2, tp);
+  ecc_mod_sub (&ecc->p, z1z2, z1z2, z1z1);
+  ecc_mod_sub (&ecc->p, z1z2, z1z2, z2z2);
+
+  /* z1^3, z2^3 */
+  ecc_mod_mul (&ecc->p, z1z1, z1z1, z1, tp);
+  ecc_mod_mul (&ecc->p, z2z2, z2z2, z2, tp);
+
+  /* z3 <-- h z1 z2 delayed until now, since that may clobber z1. */
+  ecc_mod_mul (&ecc->p, z3, z1z2, h, tp); /* z1z1, z2z2, h */
+  /* Store s1 at y3 */
+  ecc_mod_mul (&ecc->p, y3, z2z2, y1, tp); /* z1z1, h */
+  /* w = 2 (s2 - s1) */
+  ecc_mod_mul (&ecc->p, w, z1z1, y2, tp); /* h, w */
+  ecc_mod_sub (&ecc->p, w, w, y3);
+  ecc_mod_add (&ecc->p, w, w, w);
+
+  /* i = (2h)^2 */
+  ecc_mod_add (&ecc->p, i, h, h); /* h, w, i */
+  ecc_mod_sqr (&ecc->p, i, i, tp);
+
+  /* j and h can overlap */
+  ecc_mod_mul (&ecc->p, j, h, i, tp); /* j, w, i */
+
+  /* v and i can overlap */
+  ecc_mod_mul (&ecc->p, v, x3, i, tp); /* j, w, v */
+
+  /* x3 <-- w^2 - j - 2v */
+  ecc_mod_sqr (&ecc->p, x3, w, tp);
+  ecc_mod_sub (&ecc->p, x3, x3, j);
+  ecc_mod_submul_1 (&ecc->p, x3, v, 2);
+
+  /* y3 <-- w (v - x3) - 2 s1 j */
+  ecc_mod_mul (&ecc->p, j, j, y3, tp);
+  ecc_mod_sub (&ecc->p, v, v, x3);
+  ecc_mod_mul (&ecc->p, y3, v, w, tp);
+  ecc_mod_submul_1 (&ecc->p, y3, j, 2);
 }
diff --git a/ecc-internal.h b/ecc-internal.h
index d7d0fe07..21913502 100644
--- a/ecc-internal.h
+++ b/ecc-internal.h
@@ -448,7 +448,7 @@ curve448_eh_to_x (mp_limb_t *xp, const mp_limb_t *p,
 #define ECC_DUP_EH_ITCH(size) (3*(size))
 #define ECC_DUP_TH_ITCH(size) (3*(size))
 #define ECC_ADD_JJA_ITCH(size) (6*(size))
-#define ECC_ADD_JJJ_ITCH(size) (8*(size))
+#define ECC_ADD_JJJ_ITCH(size) (6*(size))
 #define ECC_ADD_EH_ITCH(size) (4*(size))
 #define ECC_ADD_EHH_ITCH(size) (4*(size))
 #define ECC_ADD_TH_ITCH(size) (4*(size))