2 * Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
4 * Licensed under the Apache License 2.0 (the "License"). You may not use
5 * this file except in compliance with the License. You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
10 /* Copyright 2023 IBM Corp.
12 * Licensed under the Apache License, Version 2.0 (the "License");
14 * you may not use this file except in compliance with the License.
15 * You may obtain a copy of the License at
17 * http://www.apache.org/licenses/LICENSE-2.0
19 * Unless required by applicable law or agreed to in writing, software
20 * distributed under the License is distributed on an "AS IS" BASIS,
21 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22 * See the License for the specific language governing permissions and
23 * limitations under the License.
27 * Designed for 56-bit limbs by Rohan McLure <rohan.mclure@linux.ibm.com>.
28 * The layout is based on that of ecp_nistp{224,521}.c, allowing even for asm
29 * acceleration of felem_{square,mul} as supported in these files.
32 #include <openssl/e_os2.h>
35 #include <openssl/err.h>
38 #include "internal/numbers.h"
41 # error "Your compiler doesn't appear to support 128-bit integer types"
48 * The underlying field. P384 operates over GF(2^384-2^128-2^96+2^32-1). We
49 * can serialize an element of this field into 48 bytes. We call this an
53 typedef u8 felem_bytearray
[48];
56 * These are the parameters of P384, taken from FIPS 186-3, section D.1.2.4.
57 * These values are big-endian.
59 static const felem_bytearray nistp384_curve_params
[5] = {
60 {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
61 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
62 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
63 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF},
64 {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a = -3 */
65 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
66 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
67 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFC},
68 {0xB3, 0x31, 0x2F, 0xA7, 0xE2, 0x3E, 0xE7, 0xE4, 0x98, 0x8E, 0x05, 0x6B, /* b */
69 0xE3, 0xF8, 0x2D, 0x19, 0x18, 0x1D, 0x9C, 0x6E, 0xFE, 0x81, 0x41, 0x12,
70 0x03, 0x14, 0x08, 0x8F, 0x50, 0x13, 0x87, 0x5A, 0xC6, 0x56, 0x39, 0x8D,
71 0x8A, 0x2E, 0xD1, 0x9D, 0x2A, 0x85, 0xC8, 0xED, 0xD3, 0xEC, 0x2A, 0xEF},
72 {0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37, 0x8E, 0xB1, 0xC7, 0x1E, /* x */
73 0xF3, 0x20, 0xAD, 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, 0x98,
74 0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, 0x38, 0x55, 0x02, 0xF2, 0x5D,
75 0xBF, 0x55, 0x29, 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, 0xB7},
76 {0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C, 0x6F, 0x5D, 0x9E, 0x98, 0xBF, /* y */
77 0x92, 0x92, 0xDC, 0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14, 0x7C,
78 0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8, 0xC0, 0x0A, 0x60, 0xB1, 0xCE,
79 0x1D, 0x7E, 0x81, 0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E, 0x5F},
83 * The representation of field elements.
84 * ------------------------------------
86 * We represent field elements with seven values. These values are either 64 or
87 * 128 bits and the field element represented is:
88 * v[0]*2^0 + v[1]*2^56 + v[2]*2^112 + ... + v[6]*2^336 (mod p)
89 * Each of the seven values is called a 'limb'. Since the limbs are spaced only
90 * 56 bits apart, but are greater than 56 bits in length, the most significant
91 * bits of each limb overlap with the least significant bits of the next
93 * This representation is considered to be 'redundant' in the sense that
94 * intermediate values can each contain more than a 56-bit value in each limb.
95 * Reduction causes all but the final limb to be reduced to contain a value less
96 * than 2^56, with the final value represented allowed to be larger than 2^384,
97 * inasmuch as we can be sure that arithmetic overflow remains impossible. The
98 * reduced value must of course be congruent to the unreduced value.
100 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
101 * 'widefelem', featuring enough bits to store the result of a multiplication
102 * and even some further arithmetic without need for immediate reduction.
107 typedef uint64_t limb
;
108 typedef uint128_t widelimb
;
109 typedef limb limb_aX
__attribute((__aligned__(1)));
110 typedef limb felem
[NLIMBS
];
111 typedef widelimb widefelem
[2*NLIMBS
-1];
113 static const limb bottom56bits
= 0xffffffffffffff;
115 /* Helper functions (de)serialising reduced field elements in little endian */
116 static void bin48_to_felem(felem out
, const u8 in
[48])
119 out
[0] = (*((limb
*) & in
[0])) & bottom56bits
;
120 out
[1] = (*((limb_aX
*) & in
[7])) & bottom56bits
;
121 out
[2] = (*((limb_aX
*) & in
[14])) & bottom56bits
;
122 out
[3] = (*((limb_aX
*) & in
[21])) & bottom56bits
;
123 out
[4] = (*((limb_aX
*) & in
[28])) & bottom56bits
;
124 out
[5] = (*((limb_aX
*) & in
[35])) & bottom56bits
;
125 memmove(&out
[6], &in
[42], 6);
128 static void felem_to_bin48(u8 out
[48], const felem in
)
131 (*((limb
*) & out
[0])) |= (in
[0] & bottom56bits
);
132 (*((limb_aX
*) & out
[7])) |= (in
[1] & bottom56bits
);
133 (*((limb_aX
*) & out
[14])) |= (in
[2] & bottom56bits
);
134 (*((limb_aX
*) & out
[21])) |= (in
[3] & bottom56bits
);
135 (*((limb_aX
*) & out
[28])) |= (in
[4] & bottom56bits
);
136 (*((limb_aX
*) & out
[35])) |= (in
[5] & bottom56bits
);
137 memmove(&out
[42], &in
[6], 6);
140 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
141 static int BN_to_felem(felem out
, const BIGNUM
*bn
)
143 felem_bytearray b_out
;
146 if (BN_is_negative(bn
)) {
147 ERR_raise(ERR_LIB_EC
, EC_R_BIGNUM_OUT_OF_RANGE
);
150 num_bytes
= BN_bn2lebinpad(bn
, b_out
, sizeof(b_out
));
152 ERR_raise(ERR_LIB_EC
, EC_R_BIGNUM_OUT_OF_RANGE
);
155 bin48_to_felem(out
, b_out
);
159 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
160 static BIGNUM
*felem_to_BN(BIGNUM
*out
, const felem in
)
162 felem_bytearray b_out
;
164 felem_to_bin48(b_out
, in
);
165 return BN_lebin2bn(b_out
, sizeof(b_out
), out
);
173 static void felem_one(felem out
)
176 memset(&out
[1], 0, sizeof(limb
) * (NLIMBS
-1));
179 static void felem_assign(felem out
, const felem in
)
181 memcpy(out
, in
, sizeof(felem
));
184 /* felem_sum64 sets out = out + in. */
185 static void felem_sum64(felem out
, const felem in
)
189 for (i
= 0; i
< NLIMBS
; i
++)
193 /* felem_scalar sets out = in * scalar */
194 static void felem_scalar(felem out
, const felem in
, limb scalar
)
198 for (i
= 0; i
< NLIMBS
; i
++)
199 out
[i
] = in
[i
] * scalar
;
202 /* felem_scalar64 sets out = out * scalar */
203 static void felem_scalar64(felem out
, limb scalar
)
207 for (i
= 0; i
< NLIMBS
; i
++)
211 /* felem_scalar128 sets out = out * scalar */
212 static void felem_scalar128(widefelem out
, limb scalar
)
216 for (i
= 0; i
< 2*NLIMBS
-1; i
++)
221 * felem_neg sets |out| to |-in|
223 * in[i] < 2^60 - 2^29
227 static void felem_neg(felem out
, const felem in
)
230 * In order to prevent underflow, we add a multiple of p before subtracting.
231 * Use telescopic sums to represent 2^12 * p redundantly with each limb
232 * of the form 2^60 + ...
234 static const limb two60m52m4
= (((limb
) 1) << 60)
237 static const limb two60p44m12
= (((limb
) 1) << 60)
239 - (((limb
) 1) << 12);
240 static const limb two60m28m4
= (((limb
) 1) << 60)
243 static const limb two60m4
= (((limb
) 1) << 60)
246 out
[0] = two60p44m12
- in
[0];
247 out
[1] = two60m52m4
- in
[1];
248 out
[2] = two60m28m4
- in
[2];
249 out
[3] = two60m4
- in
[3];
250 out
[4] = two60m4
- in
[4];
251 out
[5] = two60m4
- in
[5];
252 out
[6] = two60m4
- in
[6];
256 * felem_diff64 subtracts |in| from |out|
258 * in[i] < 2^60 - 2^52 - 2^4
260 * out[i] < out_orig[i] + 2^60 + 2^44
262 static void felem_diff64(felem out
, const felem in
)
265 * In order to prevent underflow, we add a multiple of p before subtracting.
266 * Use telescopic sums to represent 2^12 * p redundantly with each limb
267 * of the form 2^60 + ...
270 static const limb two60m52m4
= (((limb
) 1) << 60)
273 static const limb two60p44m12
= (((limb
) 1) << 60)
275 - (((limb
) 1) << 12);
276 static const limb two60m28m4
= (((limb
) 1) << 60)
279 static const limb two60m4
= (((limb
) 1) << 60)
282 out
[0] += two60p44m12
- in
[0];
283 out
[1] += two60m52m4
- in
[1];
284 out
[2] += two60m28m4
- in
[2];
285 out
[3] += two60m4
- in
[3];
286 out
[4] += two60m4
- in
[4];
287 out
[5] += two60m4
- in
[5];
288 out
[6] += two60m4
- in
[6];
293 * out[i] < out_orig[i] + 2^64 + 2^48
295 static void felem_diff_128_64(widefelem out
, const felem in
)
298 * In order to prevent underflow, we add a multiple of p before subtracting.
299 * Use telescopic sums to represent 2^16 * p redundantly with each limb
300 * of the form 2^64 + ...
303 static const widelimb two64m56m8
= (((widelimb
) 1) << 64)
304 - (((widelimb
) 1) << 56)
305 - (((widelimb
) 1) << 8);
306 static const widelimb two64m32m8
= (((widelimb
) 1) << 64)
307 - (((widelimb
) 1) << 32)
308 - (((widelimb
) 1) << 8);
309 static const widelimb two64m8
= (((widelimb
) 1) << 64)
310 - (((widelimb
) 1) << 8);
311 static const widelimb two64p48m16
= (((widelimb
) 1) << 64)
312 + (((widelimb
) 1) << 48)
313 - (((widelimb
) 1) << 16);
316 out
[0] += two64p48m16
;
317 out
[1] += two64m56m8
;
318 out
[2] += two64m32m8
;
324 for (i
= 0; i
< NLIMBS
; i
++)
329 * in[i] < 2^127 - 2^119 - 2^71
330 * out[i] < out_orig[i] + 2^127 + 2^111
332 static void felem_diff128(widefelem out
, const widefelem in
)
335 * In order to prevent underflow, we add a multiple of p before subtracting.
336 * Use telescopic sums to represent 2^415 * p redundantly with each limb
337 * of the form 2^127 + ...
340 static const widelimb two127
= ((widelimb
) 1) << 127;
341 static const widelimb two127m71
= (((widelimb
) 1) << 127)
342 - (((widelimb
) 1) << 71);
343 static const widelimb two127p111m79m71
= (((widelimb
) 1) << 127)
344 + (((widelimb
) 1) << 111)
345 - (((widelimb
) 1) << 79)
346 - (((widelimb
) 1) << 71);
347 static const widelimb two127m119m71
= (((widelimb
) 1) << 127)
348 - (((widelimb
) 1) << 119)
349 - (((widelimb
) 1) << 71);
350 static const widelimb two127m95m71
= (((widelimb
) 1) << 127)
351 - (((widelimb
) 1) << 95)
352 - (((widelimb
) 1) << 71);
361 out
[6] += two127p111m79m71
;
362 out
[7] += two127m119m71
;
363 out
[8] += two127m95m71
;
365 out
[10] += two127m71
;
366 out
[11] += two127m71
;
367 out
[12] += two127m71
;
369 for (i
= 0; i
< 2*NLIMBS
-1; i
++)
373 static void felem_square_ref(widefelem out
, const felem in
)
376 felem_scalar(inx2
, in
, 2);
378 out
[0] = ((uint128_t
) in
[0]) * in
[0];
380 out
[1] = ((uint128_t
) in
[0]) * inx2
[1];
382 out
[2] = ((uint128_t
) in
[0]) * inx2
[2]
383 + ((uint128_t
) in
[1]) * in
[1];
385 out
[3] = ((uint128_t
) in
[0]) * inx2
[3]
386 + ((uint128_t
) in
[1]) * inx2
[2];
388 out
[4] = ((uint128_t
) in
[0]) * inx2
[4]
389 + ((uint128_t
) in
[1]) * inx2
[3]
390 + ((uint128_t
) in
[2]) * in
[2];
392 out
[5] = ((uint128_t
) in
[0]) * inx2
[5]
393 + ((uint128_t
) in
[1]) * inx2
[4]
394 + ((uint128_t
) in
[2]) * inx2
[3];
396 out
[6] = ((uint128_t
) in
[0]) * inx2
[6]
397 + ((uint128_t
) in
[1]) * inx2
[5]
398 + ((uint128_t
) in
[2]) * inx2
[4]
399 + ((uint128_t
) in
[3]) * in
[3];
401 out
[7] = ((uint128_t
) in
[1]) * inx2
[6]
402 + ((uint128_t
) in
[2]) * inx2
[5]
403 + ((uint128_t
) in
[3]) * inx2
[4];
405 out
[8] = ((uint128_t
) in
[2]) * inx2
[6]
406 + ((uint128_t
) in
[3]) * inx2
[5]
407 + ((uint128_t
) in
[4]) * in
[4];
409 out
[9] = ((uint128_t
) in
[3]) * inx2
[6]
410 + ((uint128_t
) in
[4]) * inx2
[5];
412 out
[10] = ((uint128_t
) in
[4]) * inx2
[6]
413 + ((uint128_t
) in
[5]) * in
[5];
415 out
[11] = ((uint128_t
) in
[5]) * inx2
[6];
417 out
[12] = ((uint128_t
) in
[6]) * in
[6];
420 static void felem_mul_ref(widefelem out
, const felem in1
, const felem in2
)
422 out
[0] = ((uint128_t
) in1
[0]) * in2
[0];
424 out
[1] = ((uint128_t
) in1
[0]) * in2
[1]
425 + ((uint128_t
) in1
[1]) * in2
[0];
427 out
[2] = ((uint128_t
) in1
[0]) * in2
[2]
428 + ((uint128_t
) in1
[1]) * in2
[1]
429 + ((uint128_t
) in1
[2]) * in2
[0];
431 out
[3] = ((uint128_t
) in1
[0]) * in2
[3]
432 + ((uint128_t
) in1
[1]) * in2
[2]
433 + ((uint128_t
) in1
[2]) * in2
[1]
434 + ((uint128_t
) in1
[3]) * in2
[0];
436 out
[4] = ((uint128_t
) in1
[0]) * in2
[4]
437 + ((uint128_t
) in1
[1]) * in2
[3]
438 + ((uint128_t
) in1
[2]) * in2
[2]
439 + ((uint128_t
) in1
[3]) * in2
[1]
440 + ((uint128_t
) in1
[4]) * in2
[0];
442 out
[5] = ((uint128_t
) in1
[0]) * in2
[5]
443 + ((uint128_t
) in1
[1]) * in2
[4]
444 + ((uint128_t
) in1
[2]) * in2
[3]
445 + ((uint128_t
) in1
[3]) * in2
[2]
446 + ((uint128_t
) in1
[4]) * in2
[1]
447 + ((uint128_t
) in1
[5]) * in2
[0];
449 out
[6] = ((uint128_t
) in1
[0]) * in2
[6]
450 + ((uint128_t
) in1
[1]) * in2
[5]
451 + ((uint128_t
) in1
[2]) * in2
[4]
452 + ((uint128_t
) in1
[3]) * in2
[3]
453 + ((uint128_t
) in1
[4]) * in2
[2]
454 + ((uint128_t
) in1
[5]) * in2
[1]
455 + ((uint128_t
) in1
[6]) * in2
[0];
457 out
[7] = ((uint128_t
) in1
[1]) * in2
[6]
458 + ((uint128_t
) in1
[2]) * in2
[5]
459 + ((uint128_t
) in1
[3]) * in2
[4]
460 + ((uint128_t
) in1
[4]) * in2
[3]
461 + ((uint128_t
) in1
[5]) * in2
[2]
462 + ((uint128_t
) in1
[6]) * in2
[1];
464 out
[8] = ((uint128_t
) in1
[2]) * in2
[6]
465 + ((uint128_t
) in1
[3]) * in2
[5]
466 + ((uint128_t
) in1
[4]) * in2
[4]
467 + ((uint128_t
) in1
[5]) * in2
[3]
468 + ((uint128_t
) in1
[6]) * in2
[2];
470 out
[9] = ((uint128_t
) in1
[3]) * in2
[6]
471 + ((uint128_t
) in1
[4]) * in2
[5]
472 + ((uint128_t
) in1
[5]) * in2
[4]
473 + ((uint128_t
) in1
[6]) * in2
[3];
475 out
[10] = ((uint128_t
) in1
[4]) * in2
[6]
476 + ((uint128_t
) in1
[5]) * in2
[5]
477 + ((uint128_t
) in1
[6]) * in2
[4];
479 out
[11] = ((uint128_t
) in1
[5]) * in2
[6]
480 + ((uint128_t
) in1
[6]) * in2
[5];
482 out
[12] = ((uint128_t
) in1
[6]) * in2
[6];
486 * Reduce thirteen 128-bit coefficients to seven 64-bit coefficients.
487 * in[i] < 2^128 - 2^125
488 * out[i] < 2^56 for i < 6,
491 * The technique in use here stems from the format of the prime modulus:
492 * P384 = 2^384 - delta
494 * Thus we can reduce numbers of the form (X + 2^384 * Y) by substituting
495 * them with (X + delta Y), with delta = 2^128 + 2^96 + (-2^32 + 1). These
496 * coefficients are still quite large, and so we repeatedly apply this
497 * technique on high-order bits in order to guarantee the desired bounds on
498 * the size of our output.
500 * The three phases of elimination are as follows:
501 * [1]: Y = 2^120 (in[12] | in[11] | in[10] | in[9])
502 * [2]: Y = 2^8 (acc[8] | acc[7])
503 * [3]: Y = 2^48 (acc[6] >> 48)
504 * (Where a | b | c | d = (2^56)^3 a + (2^56)^2 b + (2^56) c + d)
506 static void felem_reduce(felem out
, const widefelem in
)
509 * In order to prevent underflow, we add a multiple of p before subtracting.
510 * Use telescopic sums to represent 2^76 * p redundantly with each limb
511 * of the form 2^124 + ...
513 static const widelimb two124m68
= (((widelimb
) 1) << 124)
514 - (((widelimb
) 1) << 68);
515 static const widelimb two124m116m68
= (((widelimb
) 1) << 124)
516 - (((widelimb
) 1) << 116)
517 - (((widelimb
) 1) << 68);
518 static const widelimb two124p108m76
= (((widelimb
) 1) << 124)
519 + (((widelimb
) 1) << 108)
520 - (((widelimb
) 1) << 76);
521 static const widelimb two124m92m68
= (((widelimb
) 1) << 124)
522 - (((widelimb
) 1) << 92)
523 - (((widelimb
) 1) << 68);
524 widelimb temp
, acc
[9];
527 memcpy(acc
, in
, sizeof(widelimb
) * 9);
529 acc
[0] += two124p108m76
;
530 acc
[1] += two124m116m68
;
531 acc
[2] += two124m92m68
;
537 /* [1]: Eliminate in[9], ..., in[12] */
538 acc
[8] += in
[12] >> 32;
539 acc
[7] += (in
[12] & 0xffffffff) << 24;
540 acc
[7] += in
[12] >> 8;
541 acc
[6] += (in
[12] & 0xff) << 48;
542 acc
[6] -= in
[12] >> 16;
543 acc
[5] -= (in
[12] & 0xffff) << 40;
544 acc
[6] += in
[12] >> 48;
545 acc
[5] += (in
[12] & 0xffffffffffff) << 8;
547 acc
[7] += in
[11] >> 32;
548 acc
[6] += (in
[11] & 0xffffffff) << 24;
549 acc
[6] += in
[11] >> 8;
550 acc
[5] += (in
[11] & 0xff) << 48;
551 acc
[5] -= in
[11] >> 16;
552 acc
[4] -= (in
[11] & 0xffff) << 40;
553 acc
[5] += in
[11] >> 48;
554 acc
[4] += (in
[11] & 0xffffffffffff) << 8;
556 acc
[6] += in
[10] >> 32;
557 acc
[5] += (in
[10] & 0xffffffff) << 24;
558 acc
[5] += in
[10] >> 8;
559 acc
[4] += (in
[10] & 0xff) << 48;
560 acc
[4] -= in
[10] >> 16;
561 acc
[3] -= (in
[10] & 0xffff) << 40;
562 acc
[4] += in
[10] >> 48;
563 acc
[3] += (in
[10] & 0xffffffffffff) << 8;
565 acc
[5] += in
[9] >> 32;
566 acc
[4] += (in
[9] & 0xffffffff) << 24;
567 acc
[4] += in
[9] >> 8;
568 acc
[3] += (in
[9] & 0xff) << 48;
569 acc
[3] -= in
[9] >> 16;
570 acc
[2] -= (in
[9] & 0xffff) << 40;
571 acc
[3] += in
[9] >> 48;
572 acc
[2] += (in
[9] & 0xffffffffffff) << 8;
575 * [2]: Eliminate acc[7], acc[8], that is the 7 and eighth limbs, as
576 * well as the contributions made from eliminating higher limbs.
577 * acc[7] < in[7] + 2^120 + 2^56 < in[7] + 2^121
578 * acc[8] < in[8] + 2^96
580 acc
[4] += acc
[8] >> 32;
581 acc
[3] += (acc
[8] & 0xffffffff) << 24;
582 acc
[3] += acc
[8] >> 8;
583 acc
[2] += (acc
[8] & 0xff) << 48;
584 acc
[2] -= acc
[8] >> 16;
585 acc
[1] -= (acc
[8] & 0xffff) << 40;
586 acc
[2] += acc
[8] >> 48;
587 acc
[1] += (acc
[8] & 0xffffffffffff) << 8;
589 acc
[3] += acc
[7] >> 32;
590 acc
[2] += (acc
[7] & 0xffffffff) << 24;
591 acc
[2] += acc
[7] >> 8;
592 acc
[1] += (acc
[7] & 0xff) << 48;
593 acc
[1] -= acc
[7] >> 16;
594 acc
[0] -= (acc
[7] & 0xffff) << 40;
595 acc
[1] += acc
[7] >> 48;
596 acc
[0] += (acc
[7] & 0xffffffffffff) << 8;
599 * acc[k] < in[k] + 2^124 + 2^121
601 * < 2^128, for k <= 6
606 * This has the effect of ensuring that these more significant limbs
607 * will be small in value after eliminating high bits from acc[6].
609 acc
[5] += acc
[4] >> 56;
610 acc
[4] &= 0x00ffffffffffffff;
612 acc
[6] += acc
[5] >> 56;
613 acc
[5] &= 0x00ffffffffffffff;
616 * acc[6] < in[6] + 2^124 + 2^121 + 2^72 + 2^16
621 /* [3]: Eliminate high bits of acc[6] */
623 acc
[6] &= 0x0000ffffffffffff;
627 acc
[3] += temp
>> 40;
628 acc
[2] += (temp
& 0xffffffffff) << 16;
629 acc
[2] += temp
>> 16;
630 acc
[1] += (temp
& 0xffff) << 40;
631 acc
[1] -= temp
>> 24;
632 acc
[0] -= (temp
& 0xffffff) << 32;
636 * acc[k] < acc_old[k] + 2^64 + 2^56
637 * < in[k] + 2^124 + 2^121 + 2^72 + 2^64 + 2^56 + 2^16 , k < 4
640 /* Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
641 acc
[1] += acc
[0] >> 56; /* acc[1] < acc_old[1] + 2^72 */
642 acc
[0] &= 0x00ffffffffffffff;
644 acc
[2] += acc
[1] >> 56; /* acc[2] < acc_old[2] + 2^72 + 2^16 */
645 acc
[1] &= 0x00ffffffffffffff;
647 acc
[3] += acc
[2] >> 56; /* acc[3] < acc_old[3] + 2^72 + 2^16 */
648 acc
[2] &= 0x00ffffffffffffff;
651 * acc[k] < acc_old[k] + 2^72 + 2^16
652 * < in[k] + 2^124 + 2^121 + 2^73 + 2^64 + 2^56 + 2^17
657 acc
[4] += acc
[3] >> 56; /*-
658 * acc[4] < acc_old[4] + 2^72 + 2^16
659 * < 2^72 + 2^56 + 2^16
661 acc
[3] &= 0x00ffffffffffffff;
663 acc
[5] += acc
[4] >> 56; /*-
664 * acc[5] < acc_old[5] + 2^16 + 1
667 acc
[4] &= 0x00ffffffffffffff;
669 acc
[6] += acc
[5] >> 56; /* acc[6] < 2^48 + 1 <= 2^48 */
670 acc
[5] &= 0x00ffffffffffffff;
672 for (i
= 0; i
< NLIMBS
; i
++)
676 #if defined(ECP_NISTP384_ASM)
677 static void felem_square_wrapper(widefelem out
, const felem in
);
678 static void felem_mul_wrapper(widefelem out
, const felem in1
, const felem in2
);
680 static void (*felem_square_p
)(widefelem out
, const felem in
) =
681 felem_square_wrapper
;
682 static void (*felem_mul_p
)(widefelem out
, const felem in1
, const felem in2
) =
685 void p384_felem_square(widefelem out
, const felem in
);
686 void p384_felem_mul(widefelem out
, const felem in1
, const felem in2
);
688 # if defined(_ARCH_PPC64)
689 # include "crypto/ppc_arch.h"
692 static void felem_select(void)
694 # if defined(_ARCH_PPC64)
695 if ((OPENSSL_ppccap_P
& PPC_MADD300
) && (OPENSSL_ppccap_P
& PPC_ALTIVEC
)) {
696 felem_square_p
= p384_felem_square
;
697 felem_mul_p
= p384_felem_mul
;
704 felem_square_p
= felem_square_ref
;
705 felem_mul_p
= felem_mul_ref
;
708 static void felem_square_wrapper(widefelem out
, const felem in
)
711 felem_square_p(out
, in
);
714 static void felem_mul_wrapper(widefelem out
, const felem in1
, const felem in2
)
717 felem_mul_p(out
, in1
, in2
);
720 # define felem_square felem_square_p
721 # define felem_mul felem_mul_p
723 # define felem_square felem_square_ref
724 # define felem_mul felem_mul_ref
727 static ossl_inline
void felem_square_reduce(felem out
, const felem in
)
731 felem_square(tmp
, in
);
732 felem_reduce(out
, tmp
);
735 static ossl_inline
void felem_mul_reduce(felem out
, const felem in1
, const felem in2
)
739 felem_mul(tmp
, in1
, in2
);
740 felem_reduce(out
, tmp
);
744 * felem_inv calculates |out| = |in|^{-1}
746 * Based on Fermat's Little Theorem:
748 * a^{p-1} = 1 (mod p)
749 * a^{p-2} = a^{-1} (mod p)
751 static void felem_inv(felem out
, const felem in
)
753 felem ftmp
, ftmp2
, ftmp3
, ftmp4
, ftmp5
, ftmp6
;
756 felem_square_reduce(ftmp
, in
); /* 2^1 */
757 felem_mul_reduce(ftmp
, ftmp
, in
); /* 2^1 + 2^0 */
758 felem_assign(ftmp2
, ftmp
);
760 felem_square_reduce(ftmp
, ftmp
); /* 2^2 + 2^1 */
761 felem_mul_reduce(ftmp
, ftmp
, in
); /* 2^2 + 2^1 * 2^0 */
762 felem_assign(ftmp3
, ftmp
);
764 for (i
= 0; i
< 3; i
++)
765 felem_square_reduce(ftmp
, ftmp
); /* 2^5 + 2^4 + 2^3 */
766 felem_mul_reduce(ftmp
, ftmp3
, ftmp
); /* 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0 */
767 felem_assign(ftmp4
, ftmp
);
769 for (i
= 0; i
< 6; i
++)
770 felem_square_reduce(ftmp
, ftmp
); /* 2^11 + ... + 2^6 */
771 felem_mul_reduce(ftmp
, ftmp4
, ftmp
); /* 2^11 + ... + 2^0 */
773 for (i
= 0; i
< 3; i
++)
774 felem_square_reduce(ftmp
, ftmp
); /* 2^14 + ... + 2^3 */
775 felem_mul_reduce(ftmp
, ftmp3
, ftmp
); /* 2^14 + ... + 2^0 */
776 felem_assign(ftmp5
, ftmp
);
778 for (i
= 0; i
< 15; i
++)
779 felem_square_reduce(ftmp
, ftmp
); /* 2^29 + ... + 2^15 */
780 felem_mul_reduce(ftmp
, ftmp5
, ftmp
); /* 2^29 + ... + 2^0 */
781 felem_assign(ftmp6
, ftmp
);
783 for (i
= 0; i
< 30; i
++)
784 felem_square_reduce(ftmp
, ftmp
); /* 2^59 + ... + 2^30 */
785 felem_mul_reduce(ftmp
, ftmp6
, ftmp
); /* 2^59 + ... + 2^0 */
786 felem_assign(ftmp4
, ftmp
);
788 for (i
= 0; i
< 60; i
++)
789 felem_square_reduce(ftmp
, ftmp
); /* 2^119 + ... + 2^60 */
790 felem_mul_reduce(ftmp
, ftmp4
, ftmp
); /* 2^119 + ... + 2^0 */
791 felem_assign(ftmp4
, ftmp
);
793 for (i
= 0; i
< 120; i
++)
794 felem_square_reduce(ftmp
, ftmp
); /* 2^239 + ... + 2^120 */
795 felem_mul_reduce(ftmp
, ftmp4
, ftmp
); /* 2^239 + ... + 2^0 */
797 for (i
= 0; i
< 15; i
++)
798 felem_square_reduce(ftmp
, ftmp
); /* 2^254 + ... + 2^15 */
799 felem_mul_reduce(ftmp
, ftmp5
, ftmp
); /* 2^254 + ... + 2^0 */
801 for (i
= 0; i
< 31; i
++)
802 felem_square_reduce(ftmp
, ftmp
); /* 2^285 + ... + 2^31 */
803 felem_mul_reduce(ftmp
, ftmp6
, ftmp
); /* 2^285 + ... + 2^31 + 2^29 + ... + 2^0 */
805 for (i
= 0; i
< 2; i
++)
806 felem_square_reduce(ftmp
, ftmp
); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^2 */
807 felem_mul_reduce(ftmp
, ftmp2
, ftmp
); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^0 */
809 for (i
= 0; i
< 94; i
++)
810 felem_square_reduce(ftmp
, ftmp
); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 */
811 felem_mul_reduce(ftmp
, ftmp6
, ftmp
); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 + 2^29 + ... + 2^0 */
813 for (i
= 0; i
< 2; i
++)
814 felem_square_reduce(ftmp
, ftmp
); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 */
815 felem_mul_reduce(ftmp
, in
, ftmp
); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 + 2^0 */
817 memcpy(out
, ftmp
, sizeof(felem
));
821 * Zero-check: returns a limb with all bits set if |in| == 0 (mod p)
822 * and 0 otherwise. We know that field elements are reduced to
823 * 0 < in < 2p, so we only need to check two cases:
824 * 0 and 2^384 - 2^128 - 2^96 + 2^32 - 1
825 * in[k] < 2^56, k < 6
828 static limb
felem_is_zero(const felem in
)
832 zero
= in
[0] | in
[1] | in
[2] | in
[3] | in
[4] | in
[5] | in
[6];
833 zero
= ((int64_t) (zero
) - 1) >> 63;
834 p384
= (in
[0] ^ 0x000000ffffffff) | (in
[1] ^ 0xffff0000000000)
835 | (in
[2] ^ 0xfffffffffeffff) | (in
[3] ^ 0xffffffffffffff)
836 | (in
[4] ^ 0xffffffffffffff) | (in
[5] ^ 0xffffffffffffff)
837 | (in
[6] ^ 0xffffffffffff);
838 p384
= ((int64_t) (p384
) - 1) >> 63;
840 return (zero
| p384
);
843 static int felem_is_zero_int(const void *in
)
845 return (int)(felem_is_zero(in
) & ((limb
) 1));
849 * felem_contract converts |in| to its unique, minimal representation.
850 * Assume we've removed all redundant bits.
852 * in[k] < 2^56, k < 6
855 static void felem_contract(felem out
, const felem in
)
857 static const int64_t two56
= ((limb
) 1) << 56;
860 * We know for a fact that 0 <= |in| < 2*p, for p = 2^384 - 2^128 - 2^96 + 2^32 - 1
861 * Perform two successive, idempotent subtractions to reduce if |in| >= p.
864 int64_t tmp
[NLIMBS
], cond
[5], a
;
867 memcpy(tmp
, in
, sizeof(felem
));
869 /* Case 1: a = 1 iff |in| >= 2^384 */
875 tmp
[6] &= 0x0000ffffffffffff;
878 * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
879 * non-zero, so we only need one step
886 /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
887 tmp
[2] += tmp
[1] >> 56;
888 tmp
[1] &= 0x00ffffffffffffff;
890 tmp
[3] += tmp
[2] >> 56;
891 tmp
[2] &= 0x00ffffffffffffff;
893 tmp
[4] += tmp
[3] >> 56;
894 tmp
[3] &= 0x00ffffffffffffff;
896 tmp
[5] += tmp
[4] >> 56;
897 tmp
[4] &= 0x00ffffffffffffff;
899 tmp
[6] += tmp
[5] >> 56; /* tmp[6] < 2^48 */
900 tmp
[5] &= 0x00ffffffffffffff;
903 * Case 2: a = all ones if p <= |in| < 2^384, 0 otherwise
906 /* 0 iff (2^129..2^383) are all one */
907 cond
[0] = ((tmp
[6] | 0xff000000000000) & tmp
[5] & tmp
[4] & tmp
[3] & (tmp
[2] | 0x0000000001ffff)) + 1;
908 /* 0 iff 2^128 bit is one */
909 cond
[1] = (tmp
[2] | ~0x00000000010000) + 1;
910 /* 0 iff (2^96..2^127) bits are all one */
911 cond
[2] = ((tmp
[2] | 0xffffffffff0000) & (tmp
[1] | 0x0000ffffffffff)) + 1;
912 /* 0 iff (2^32..2^95) bits are all zero */
913 cond
[3] = (tmp
[1] & ~0xffff0000000000) | (tmp
[0] & ~((int64_t) 0x000000ffffffff));
914 /* 0 iff (2^0..2^31) bits are all one */
915 cond
[4] = (tmp
[0] | 0xffffff00000000) + 1;
918 * In effect, invert our conditions, so that 0 values become all 1's,
919 * any non-zero value in the low-order 56 bits becomes all 0's
921 for (i
= 0; i
< 5; i
++)
922 cond
[i
] = ((cond
[i
] & 0x00ffffffffffffff) - 1) >> 63;
925 * The condition for determining whether in is greater than our
926 * prime is given by the following condition.
929 /* First subtract 2^384 - 2^129 cheaply */
930 a
= cond
[0] & (cond
[1] | (cond
[2] & (~cond
[3] | cond
[4])));
935 tmp
[2] &= ~a
| 0x0000000001ffff;
938 * Subtract 2^128 - 2^96 by
939 * means of disjoint cases.
942 /* subtract 2^128 if that bit is present, and add 2^96 */
943 a
= cond
[0] & cond
[1];
944 tmp
[2] &= ~a
| 0xfffffffffeffff;
945 tmp
[1] += a
& ((int64_t) 1 << 40);
947 /* otherwise, clear bits 2^127 .. 2^96 */
948 a
= cond
[0] & ~cond
[1] & (cond
[2] & (~cond
[3] | cond
[4]));
949 tmp
[2] &= ~a
| 0xffffffffff0000;
950 tmp
[1] &= ~a
| 0x0000ffffffffff;
952 /* finally, subtract the last 2^32 - 1 */
953 a
= cond
[0] & (cond
[1] | (cond
[2] & (~cond
[3] | cond
[4])));
954 tmp
[0] += a
& (-((int64_t) 1 << 32) + 1);
957 * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
958 * non-zero, so we only need one step
964 /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
965 tmp
[2] += tmp
[1] >> 56;
966 tmp
[1] &= 0x00ffffffffffffff;
968 tmp
[3] += tmp
[2] >> 56;
969 tmp
[2] &= 0x00ffffffffffffff;
971 tmp
[4] += tmp
[3] >> 56;
972 tmp
[3] &= 0x00ffffffffffffff;
974 tmp
[5] += tmp
[4] >> 56;
975 tmp
[4] &= 0x00ffffffffffffff;
977 tmp
[6] += tmp
[5] >> 56;
978 tmp
[5] &= 0x00ffffffffffffff;
980 memcpy(out
, tmp
, sizeof(felem
));
987 * Building on top of the field operations we have the operations on the
988 * elliptic curve group itself. Points on the curve are represented in Jacobian
993 * point_double calculates 2*(x_in, y_in, z_in)
995 * The method is taken from:
996 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
998 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
999 * while x_out == y_in is not (maybe this works, but it's not tested).
1002 point_double(felem x_out
, felem y_out
, felem z_out
,
1003 const felem x_in
, const felem y_in
, const felem z_in
)
1005 widefelem tmp
, tmp2
;
1006 felem delta
, gamma
, beta
, alpha
, ftmp
, ftmp2
;
1008 felem_assign(ftmp
, x_in
);
1009 felem_assign(ftmp2
, x_in
);
1012 felem_square_reduce(delta
, z_in
); /* delta[i] < 2^56 */
1015 felem_square_reduce(gamma
, y_in
); /* gamma[i] < 2^56 */
1017 /* beta = x*gamma */
1018 felem_mul_reduce(beta
, x_in
, gamma
); /* beta[i] < 2^56 */
1020 /* alpha = 3*(x-delta)*(x+delta) */
1021 felem_diff64(ftmp
, delta
); /* ftmp[i] < 2^60 + 2^58 + 2^44 */
1022 felem_sum64(ftmp2
, delta
); /* ftmp2[i] < 2^59 */
1023 felem_scalar64(ftmp2
, 3); /* ftmp2[i] < 2^61 */
1024 felem_mul_reduce(alpha
, ftmp
, ftmp2
); /* alpha[i] < 2^56 */
1026 /* x' = alpha^2 - 8*beta */
1027 felem_square(tmp
, alpha
); /* tmp[i] < 2^115 */
1028 felem_assign(ftmp
, beta
); /* ftmp[i] < 2^56 */
1029 felem_scalar64(ftmp
, 8); /* ftmp[i] < 2^59 */
1030 felem_diff_128_64(tmp
, ftmp
); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1031 felem_reduce(x_out
, tmp
); /* x_out[i] < 2^56 */
1033 /* z' = (y + z)^2 - gamma - delta */
1034 felem_sum64(delta
, gamma
); /* delta[i] < 2^57 */
1035 felem_assign(ftmp
, y_in
); /* ftmp[i] < 2^56 */
1036 felem_sum64(ftmp
, z_in
); /* ftmp[i] < 2^56 */
1037 felem_square(tmp
, ftmp
); /* tmp[i] < 2^115 */
1038 felem_diff_128_64(tmp
, delta
); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1039 felem_reduce(z_out
, tmp
); /* z_out[i] < 2^56 */
1041 /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1042 felem_scalar64(beta
, 4); /* beta[i] < 2^58 */
1043 felem_diff64(beta
, x_out
); /* beta[i] < 2^60 + 2^58 + 2^44 */
1044 felem_mul(tmp
, alpha
, beta
); /* tmp[i] < 2^119 */
1045 felem_square(tmp2
, gamma
); /* tmp2[i] < 2^115 */
1046 felem_scalar128(tmp2
, 8); /* tmp2[i] < 2^118 */
1047 felem_diff128(tmp
, tmp2
); /* tmp[i] < 2^127 + 2^119 + 2^111 */
1048 felem_reduce(y_out
, tmp
); /* tmp[i] < 2^56 */
1051 /* copy_conditional copies in to out iff mask is all ones. */
1052 static void copy_conditional(felem out
, const felem in
, limb mask
)
1056 for (i
= 0; i
< NLIMBS
; i
++)
1057 out
[i
] ^= mask
& (in
[i
] ^ out
[i
]);
1061 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1063 * The method is taken from
1064 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1065 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1067 * This function includes a branch for checking whether the two input points
1068 * are equal (while not equal to the point at infinity). See comment below
1071 static void point_add(felem x3
, felem y3
, felem z3
,
1072 const felem x1
, const felem y1
, const felem z1
,
1073 const int mixed
, const felem x2
, const felem y2
,
1076 felem ftmp
, ftmp2
, ftmp3
, ftmp4
, ftmp5
, ftmp6
, x_out
, y_out
, z_out
;
1077 widefelem tmp
, tmp2
;
1078 limb x_equal
, y_equal
, z1_is_zero
, z2_is_zero
;
1081 z1_is_zero
= felem_is_zero(z1
);
1082 z2_is_zero
= felem_is_zero(z2
);
1084 /* ftmp = z1z1 = z1**2 */
1085 felem_square_reduce(ftmp
, z1
); /* ftmp[i] < 2^56 */
1088 /* ftmp2 = z2z2 = z2**2 */
1089 felem_square_reduce(ftmp2
, z2
); /* ftmp2[i] < 2^56 */
1091 /* u1 = ftmp3 = x1*z2z2 */
1092 felem_mul_reduce(ftmp3
, x1
, ftmp2
); /* ftmp3[i] < 2^56 */
1094 /* ftmp5 = z1 + z2 */
1095 felem_assign(ftmp5
, z1
); /* ftmp5[i] < 2^56 */
1096 felem_sum64(ftmp5
, z2
); /* ftmp5[i] < 2^57 */
1098 /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1099 felem_square(tmp
, ftmp5
); /* tmp[i] < 2^117 */
1100 felem_diff_128_64(tmp
, ftmp
); /* tmp[i] < 2^117 + 2^64 + 2^48 */
1101 felem_diff_128_64(tmp
, ftmp2
); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1102 felem_reduce(ftmp5
, tmp
); /* ftmp5[i] < 2^56 */
1104 /* ftmp2 = z2 * z2z2 */
1105 felem_mul_reduce(ftmp2
, ftmp2
, z2
); /* ftmp2[i] < 2^56 */
1107 /* s1 = ftmp6 = y1 * z2**3 */
1108 felem_mul_reduce(ftmp6
, y1
, ftmp2
); /* ftmp6[i] < 2^56 */
1111 * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1114 /* u1 = ftmp3 = x1*z2z2 */
1115 felem_assign(ftmp3
, x1
); /* ftmp3[i] < 2^56 */
1117 /* ftmp5 = 2*z1z2 */
1118 felem_scalar(ftmp5
, z1
, 2); /* ftmp5[i] < 2^57 */
1120 /* s1 = ftmp6 = y1 * z2**3 */
1121 felem_assign(ftmp6
, y1
); /* ftmp6[i] < 2^56 */
1123 /* ftmp3[i] < 2^56, ftmp5[i] < 2^57, ftmp6[i] < 2^56 */
1126 felem_mul(tmp
, x2
, ftmp
); /* tmp[i] < 2^115 */
1128 /* h = ftmp4 = u2 - u1 */
1129 felem_diff_128_64(tmp
, ftmp3
); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1130 felem_reduce(ftmp4
, tmp
); /* ftmp[4] < 2^56 */
1132 x_equal
= felem_is_zero(ftmp4
);
1134 /* z_out = ftmp5 * h */
1135 felem_mul_reduce(z_out
, ftmp5
, ftmp4
); /* z_out[i] < 2^56 */
1137 /* ftmp = z1 * z1z1 */
1138 felem_mul_reduce(ftmp
, ftmp
, z1
); /* ftmp[i] < 2^56 */
1140 /* s2 = tmp = y2 * z1**3 */
1141 felem_mul(tmp
, y2
, ftmp
); /* tmp[i] < 2^115 */
1143 /* r = ftmp5 = (s2 - s1)*2 */
1144 felem_diff_128_64(tmp
, ftmp6
); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1145 felem_reduce(ftmp5
, tmp
); /* ftmp5[i] < 2^56 */
1146 y_equal
= felem_is_zero(ftmp5
);
1147 felem_scalar64(ftmp5
, 2); /* ftmp5[i] < 2^57 */
1150 * The formulae are incorrect if the points are equal, in affine coordinates
1151 * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1154 * We use bitwise operations to avoid potential side-channels introduced by
1155 * the short-circuiting behaviour of boolean operators.
1157 * The special case of either point being the point at infinity (z1 and/or
1158 * z2 are zero), is handled separately later on in this function, so we
1159 * avoid jumping to point_double here in those special cases.
1161 * Notice the comment below on the implications of this branching for timing
1162 * leaks and why it is considered practically irrelevant.
1164 points_equal
= (x_equal
& y_equal
& (~z1_is_zero
) & (~z2_is_zero
));
1168 * This is obviously not constant-time but it will almost-never happen
1171 point_double(x3
, y3
, z3
, x1
, y1
, z1
);
1175 /* I = ftmp = (2h)**2 */
1176 felem_assign(ftmp
, ftmp4
); /* ftmp[i] < 2^56 */
1177 felem_scalar64(ftmp
, 2); /* ftmp[i] < 2^57 */
1178 felem_square_reduce(ftmp
, ftmp
); /* ftmp[i] < 2^56 */
1180 /* J = ftmp2 = h * I */
1181 felem_mul_reduce(ftmp2
, ftmp4
, ftmp
); /* ftmp2[i] < 2^56 */
1183 /* V = ftmp4 = U1 * I */
1184 felem_mul_reduce(ftmp4
, ftmp3
, ftmp
); /* ftmp4[i] < 2^56 */
1186 /* x_out = r**2 - J - 2V */
1187 felem_square(tmp
, ftmp5
); /* tmp[i] < 2^117 */
1188 felem_diff_128_64(tmp
, ftmp2
); /* tmp[i] < 2^117 + 2^64 + 2^48 */
1189 felem_assign(ftmp3
, ftmp4
); /* ftmp3[i] < 2^56 */
1190 felem_scalar64(ftmp4
, 2); /* ftmp4[i] < 2^57 */
1191 felem_diff_128_64(tmp
, ftmp4
); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1192 felem_reduce(x_out
, tmp
); /* x_out[i] < 2^56 */
1194 /* y_out = r(V-x_out) - 2 * s1 * J */
1195 felem_diff64(ftmp3
, x_out
); /* ftmp3[i] < 2^60 + 2^56 + 2^44 */
1196 felem_mul(tmp
, ftmp5
, ftmp3
); /* tmp[i] < 2^116 */
1197 felem_mul(tmp2
, ftmp6
, ftmp2
); /* tmp2[i] < 2^115 */
1198 felem_scalar128(tmp2
, 2); /* tmp2[i] < 2^116 */
1199 felem_diff128(tmp
, tmp2
); /* tmp[i] < 2^127 + 2^116 + 2^111 */
1200 felem_reduce(y_out
, tmp
); /* y_out[i] < 2^56 */
1202 copy_conditional(x_out
, x2
, z1_is_zero
);
1203 copy_conditional(x_out
, x1
, z2_is_zero
);
1204 copy_conditional(y_out
, y2
, z1_is_zero
);
1205 copy_conditional(y_out
, y1
, z2_is_zero
);
1206 copy_conditional(z_out
, z2
, z1_is_zero
);
1207 copy_conditional(z_out
, z1
, z2_is_zero
);
1208 felem_assign(x3
, x_out
);
1209 felem_assign(y3
, y_out
);
1210 felem_assign(z3
, z_out
);
1214 * Base point pre computation
1215 * --------------------------
1217 * Two different sorts of precomputed tables are used in the following code.
1218 * Each contain various points on the curve, where each point is three field
1219 * elements (x, y, z).
1221 * For the base point table, z is usually 1 (0 for the point at infinity).
1222 * This table has 16 elements:
1223 * index | bits | point
1224 * ------+---------+------------------------------
1227 * 2 | 0 0 1 0 | 2^95G
1228 * 3 | 0 0 1 1 | (2^95 + 1)G
1229 * 4 | 0 1 0 0 | 2^190G
1230 * 5 | 0 1 0 1 | (2^190 + 1)G
1231 * 6 | 0 1 1 0 | (2^190 + 2^95)G
1232 * 7 | 0 1 1 1 | (2^190 + 2^95 + 1)G
1233 * 8 | 1 0 0 0 | 2^285G
1234 * 9 | 1 0 0 1 | (2^285 + 1)G
1235 * 10 | 1 0 1 0 | (2^285 + 2^95)G
1236 * 11 | 1 0 1 1 | (2^285 + 2^95 + 1)G
1237 * 12 | 1 1 0 0 | (2^285 + 2^190)G
1238 * 13 | 1 1 0 1 | (2^285 + 2^190 + 1)G
1239 * 14 | 1 1 1 0 | (2^285 + 2^190 + 2^95)G
1240 * 15 | 1 1 1 1 | (2^285 + 2^190 + 2^95 + 1)G
1242 * The reason for this is so that we can clock bits into four different
1243 * locations when doing simple scalar multiplies against the base point.
1245 * Tables for other points have table[i] = iG for i in 0 .. 16.
1248 /* gmul is the table of precomputed base points */
1249 static const felem gmul
[16][3] = {
1250 {{0, 0, 0, 0, 0, 0, 0},
1251 {0, 0, 0, 0, 0, 0, 0},
1252 {0, 0, 0, 0, 0, 0, 0}},
1253 {{0x00545e3872760ab7, 0x00f25dbf55296c3a, 0x00e082542a385502, 0x008ba79b9859f741,
1254 0x0020ad746e1d3b62, 0x0005378eb1c71ef3, 0x0000aa87ca22be8b},
1255 {0x00431d7c90ea0e5f, 0x00b1ce1d7e819d7a, 0x0013b5f0b8c00a60, 0x00289a147ce9da31,
1256 0x0092dc29f8f41dbd, 0x002c6f5d9e98bf92, 0x00003617de4a9626},
1257 {1, 0, 0, 0, 0, 0, 0}},
1258 {{0x00024711cc902a90, 0x00acb2e579ab4fe1, 0x00af818a4b4d57b1, 0x00a17c7bec49c3de,
1259 0x004280482d726a8b, 0x00128dd0f0a90f3b, 0x00004387c1c3fa3c},
1260 {0x002ce76543cf5c3a, 0x00de6cee5ef58f0a, 0x00403e42fa561ca6, 0x00bc54d6f9cb9731,
1261 0x007155f925fb4ff1, 0x004a9ce731b7b9bc, 0x00002609076bd7b2},
1262 {1, 0, 0, 0, 0, 0, 0}},
1263 {{0x00e74c9182f0251d, 0x0039bf54bb111974, 0x00b9d2f2eec511d2, 0x0036b1594eb3a6a4,
1264 0x00ac3bb82d9d564b, 0x00f9313f4615a100, 0x00006716a9a91b10},
1265 {0x0046698116e2f15c, 0x00f34347067d3d33, 0x008de4ccfdebd002, 0x00e838c6b8e8c97b,
1266 0x006faf0798def346, 0x007349794a57563c, 0x00002629e7e6ad84},
1267 {1, 0, 0, 0, 0, 0, 0}},
1268 {{0x0075300e34fd163b, 0x0092e9db4e8d0ad3, 0x00254be9f625f760, 0x00512c518c72ae68,
1269 0x009bfcf162bede5a, 0x00bf9341566ce311, 0x0000cd6175bd41cf},
1270 {0x007dfe52af4ac70f, 0x0002159d2d5c4880, 0x00b504d16f0af8d0, 0x0014585e11f5e64c,
1271 0x0089c6388e030967, 0x00ffb270cbfa5f71, 0x00009a15d92c3947},
1272 {1, 0, 0, 0, 0, 0, 0}},
1273 {{0x0033fc1278dc4fe5, 0x00d53088c2caa043, 0x0085558827e2db66, 0x00c192bef387b736,
1274 0x00df6405a2225f2c, 0x0075205aa90fd91a, 0x0000137e3f12349d},
1275 {0x00ce5b115efcb07e, 0x00abc3308410deeb, 0x005dc6fc1de39904, 0x00907c1c496f36b4,
1276 0x0008e6ad3926cbe1, 0x00110747b787928c, 0x0000021b9162eb7e},
1277 {1, 0, 0, 0, 0, 0, 0}},
1278 {{0x008180042cfa26e1, 0x007b826a96254967, 0x0082473694d6b194, 0x007bd6880a45b589,
1279 0x00c0a5097072d1a3, 0x0019186555e18b4e, 0x000020278190e5ca},
1280 {0x00b4bef17de61ac0, 0x009535e3c38ed348, 0x002d4aa8e468ceab, 0x00ef40b431036ad3,
1281 0x00defd52f4542857, 0x0086edbf98234266, 0x00002025b3a7814d},
1282 {1, 0, 0, 0, 0, 0, 0}},
1283 {{0x00b238aa97b886be, 0x00ef3192d6dd3a32, 0x0079f9e01fd62df8, 0x00742e890daba6c5,
1284 0x008e5289144408ce, 0x0073bbcc8e0171a5, 0x0000c4fd329d3b52},
1285 {0x00c6f64a15ee23e7, 0x00dcfb7b171cad8b, 0x00039f6cbd805867, 0x00de024e428d4562,
1286 0x00be6a594d7c64c5, 0x0078467b70dbcd64, 0x0000251f2ed7079b},
1287 {1, 0, 0, 0, 0, 0, 0}},
1288 {{0x000e5cc25fc4b872, 0x005ebf10d31ef4e1, 0x0061e0ebd11e8256, 0x0076e026096f5a27,
1289 0x0013e6fc44662e9a, 0x0042b00289d3597e, 0x000024f089170d88},
1290 {0x001604d7e0effbe6, 0x0048d77cba64ec2c, 0x008166b16da19e36, 0x006b0d1a0f28c088,
1291 0x000259fcd47754fd, 0x00cc643e4d725f9a, 0x00007b10f3c79c14},
1292 {1, 0, 0, 0, 0, 0, 0}},
1293 {{0x00430155e3b908af, 0x00b801e4fec25226, 0x00b0d4bcfe806d26, 0x009fc4014eb13d37,
1294 0x0066c94e44ec07e8, 0x00d16adc03874ba2, 0x000030c917a0d2a7},
1295 {0x00edac9e21eb891c, 0x00ef0fb768102eff, 0x00c088cef272a5f3, 0x00cbf782134e2964,
1296 0x0001044a7ba9a0e3, 0x00e363f5b194cf3c, 0x00009ce85249e372},
1297 {1, 0, 0, 0, 0, 0, 0}},
1298 {{0x001dd492dda5a7eb, 0x008fd577be539fd1, 0x002ff4b25a5fc3f1, 0x0074a8a1b64df72f,
1299 0x002ba3d8c204a76c, 0x009d5cff95c8235a, 0x0000e014b9406e0f},
1300 {0x008c2e4dbfc98aba, 0x00f30bb89f1a1436, 0x00b46f7aea3e259c, 0x009224454ac02f54,
1301 0x00906401f5645fa2, 0x003a1d1940eabc77, 0x00007c9351d680e6},
1302 {1, 0, 0, 0, 0, 0, 0}},
1303 {{0x005a35d872ef967c, 0x0049f1b7884e1987, 0x0059d46d7e31f552, 0x00ceb4869d2d0fb6,
1304 0x00e8e89eee56802a, 0x0049d806a774aaf2, 0x0000147e2af0ae24},
1305 {0x005fd1bd852c6e5e, 0x00b674b7b3de6885, 0x003b9ea5eb9b6c08, 0x005c9f03babf3ef7,
1306 0x00605337fecab3c7, 0x009a3f85b11bbcc8, 0x0000455470f330ec},
1307 {1, 0, 0, 0, 0, 0, 0}},
1308 {{0x002197ff4d55498d, 0x00383e8916c2d8af, 0x00eb203f34d1c6d2, 0x0080367cbd11b542,
1309 0x00769b3be864e4f5, 0x0081a8458521c7bb, 0x0000c531b34d3539},
1310 {0x00e2a3d775fa2e13, 0x00534fc379573844, 0x00ff237d2a8db54a, 0x00d301b2335a8882,
1311 0x000f75ea96103a80, 0x0018fecb3cdd96fa, 0x0000304bf61e94eb},
1312 {1, 0, 0, 0, 0, 0, 0}},
1313 {{0x00b2afc332a73dbd, 0x0029a0d5bb007bc5, 0x002d628eb210f577, 0x009f59a36dd05f50,
1314 0x006d339de4eca613, 0x00c75a71addc86bc, 0x000060384c5ea93c},
1315 {0x00aa9641c32a30b4, 0x00cc73ae8cce565d, 0x00ec911a4df07f61, 0x00aa4b762ea4b264,
1316 0x0096d395bb393629, 0x004efacfb7632fe0, 0x00006f252f46fa3f},
1317 {1, 0, 0, 0, 0, 0, 0}},
1318 {{0x00567eec597c7af6, 0x0059ba6795204413, 0x00816d4e6f01196f, 0x004ae6b3eb57951d,
1319 0x00420f5abdda2108, 0x003401d1f57ca9d9, 0x0000cf5837b0b67a},
1320 {0x00eaa64b8aeeabf9, 0x00246ddf16bcb4de, 0x000e7e3c3aecd751, 0x0008449f04fed72e,
1321 0x00307b67ccf09183, 0x0017108c3556b7b1, 0x0000229b2483b3bf},
1322 {1, 0, 0, 0, 0, 0, 0}},
1323 {{0x00e7c491a7bb78a1, 0x00eafddd1d3049ab, 0x00352c05e2bc7c98, 0x003d6880c165fa5c,
1324 0x00b6ac61cc11c97d, 0x00beeb54fcf90ce5, 0x0000dc1f0b455edc},
1325 {0x002db2e7aee34d60, 0x0073b5f415a2d8c0, 0x00dd84e4193e9a0c, 0x00d02d873467c572,
1326 0x0018baaeda60aee5, 0x0013fb11d697c61e, 0x000083aafcc3a973},
1327 {1, 0, 0, 0, 0, 0, 0}}
1331 * select_point selects the |idx|th point from a precomputation table and
1334 * pre_comp below is of the size provided in |size|.
1336 static void select_point(const limb idx
, unsigned int size
,
1337 const felem pre_comp
[][3], felem out
[3])
1340 limb
*outlimbs
= &out
[0][0];
1342 memset(out
, 0, sizeof(*out
) * 3);
1344 for (i
= 0; i
< size
; i
++) {
1345 const limb
*inlimbs
= &pre_comp
[i
][0][0];
1346 limb mask
= i
^ idx
;
1353 for (j
= 0; j
< NLIMBS
* 3; j
++)
1354 outlimbs
[j
] |= inlimbs
[j
] & mask
;
1358 /* get_bit returns the |i|th bit in |in| */
1359 static char get_bit(const felem_bytearray in
, int i
)
1361 if (i
< 0 || i
>= 384)
1363 return (in
[i
>> 3] >> (i
& 7)) & 1;
1367 * Interleaved point multiplication using precomputed point multiples: The
1368 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1369 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1370 * generator, using certain (large) precomputed multiples in g_pre_comp.
1371 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1373 static void batch_mul(felem x_out
, felem y_out
, felem z_out
,
1374 const felem_bytearray scalars
[],
1375 const unsigned int num_points
, const u8
*g_scalar
,
1376 const int mixed
, const felem pre_comp
[][17][3],
1377 const felem g_pre_comp
[16][3])
1380 unsigned int num
, gen_mul
= (g_scalar
!= NULL
);
1381 felem nq
[3], tmp
[4];
1385 /* set nq to the point at infinity */
1386 memset(nq
, 0, sizeof(nq
));
1389 * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1390 * of the generator (last quarter of rounds) and additions of other
1391 * points multiples (every 5th round).
1393 skip
= 1; /* save two point operations in the first
1395 for (i
= (num_points
? 380 : 98); i
>= 0; --i
) {
1398 point_double(nq
[0], nq
[1], nq
[2], nq
[0], nq
[1], nq
[2]);
1400 /* add multiples of the generator */
1401 if (gen_mul
&& (i
<= 98)) {
1402 bits
= get_bit(g_scalar
, i
+ 285) << 3;
1404 bits
|= get_bit(g_scalar
, i
+ 190) << 2;
1405 bits
|= get_bit(g_scalar
, i
+ 95) << 1;
1406 bits
|= get_bit(g_scalar
, i
);
1408 /* select the point to add, in constant time */
1409 select_point(bits
, 16, g_pre_comp
, tmp
);
1411 /* The 1 argument below is for "mixed" */
1412 point_add(nq
[0], nq
[1], nq
[2],
1413 nq
[0], nq
[1], nq
[2], 1,
1414 tmp
[0], tmp
[1], tmp
[2]);
1416 memcpy(nq
, tmp
, 3 * sizeof(felem
));
1421 /* do other additions every 5 doublings */
1422 if (num_points
&& (i
% 5 == 0)) {
1423 /* loop over all scalars */
1424 for (num
= 0; num
< num_points
; ++num
) {
1425 bits
= get_bit(scalars
[num
], i
+ 4) << 5;
1426 bits
|= get_bit(scalars
[num
], i
+ 3) << 4;
1427 bits
|= get_bit(scalars
[num
], i
+ 2) << 3;
1428 bits
|= get_bit(scalars
[num
], i
+ 1) << 2;
1429 bits
|= get_bit(scalars
[num
], i
) << 1;
1430 bits
|= get_bit(scalars
[num
], i
- 1);
1431 ossl_ec_GFp_nistp_recode_scalar_bits(&sign
, &digit
, bits
);
1434 * select the point to add or subtract, in constant time
1436 select_point(digit
, 17, pre_comp
[num
], tmp
);
1437 felem_neg(tmp
[3], tmp
[1]); /* (X, -Y, Z) is the negative
1439 copy_conditional(tmp
[1], tmp
[3], (-(limb
) sign
));
1442 point_add(nq
[0], nq
[1], nq
[2],
1443 nq
[0], nq
[1], nq
[2], mixed
,
1444 tmp
[0], tmp
[1], tmp
[2]);
1446 memcpy(nq
, tmp
, 3 * sizeof(felem
));
1452 felem_assign(x_out
, nq
[0]);
1453 felem_assign(y_out
, nq
[1]);
1454 felem_assign(z_out
, nq
[2]);
1457 /* Precomputation for the group generator. */
1458 struct nistp384_pre_comp_st
{
1459 felem g_pre_comp
[16][3];
1460 CRYPTO_REF_COUNT references
;
1463 const EC_METHOD
*ossl_ec_GFp_nistp384_method(void)
1465 static const EC_METHOD ret
= {
1466 EC_FLAGS_DEFAULT_OCT
,
1467 NID_X9_62_prime_field
,
1468 ossl_ec_GFp_nistp384_group_init
,
1469 ossl_ec_GFp_simple_group_finish
,
1470 ossl_ec_GFp_simple_group_clear_finish
,
1471 ossl_ec_GFp_nist_group_copy
,
1472 ossl_ec_GFp_nistp384_group_set_curve
,
1473 ossl_ec_GFp_simple_group_get_curve
,
1474 ossl_ec_GFp_simple_group_get_degree
,
1475 ossl_ec_group_simple_order_bits
,
1476 ossl_ec_GFp_simple_group_check_discriminant
,
1477 ossl_ec_GFp_simple_point_init
,
1478 ossl_ec_GFp_simple_point_finish
,
1479 ossl_ec_GFp_simple_point_clear_finish
,
1480 ossl_ec_GFp_simple_point_copy
,
1481 ossl_ec_GFp_simple_point_set_to_infinity
,
1482 ossl_ec_GFp_simple_point_set_affine_coordinates
,
1483 ossl_ec_GFp_nistp384_point_get_affine_coordinates
,
1484 0, /* point_set_compressed_coordinates */
1487 ossl_ec_GFp_simple_add
,
1488 ossl_ec_GFp_simple_dbl
,
1489 ossl_ec_GFp_simple_invert
,
1490 ossl_ec_GFp_simple_is_at_infinity
,
1491 ossl_ec_GFp_simple_is_on_curve
,
1492 ossl_ec_GFp_simple_cmp
,
1493 ossl_ec_GFp_simple_make_affine
,
1494 ossl_ec_GFp_simple_points_make_affine
,
1495 ossl_ec_GFp_nistp384_points_mul
,
1496 ossl_ec_GFp_nistp384_precompute_mult
,
1497 ossl_ec_GFp_nistp384_have_precompute_mult
,
1498 ossl_ec_GFp_nist_field_mul
,
1499 ossl_ec_GFp_nist_field_sqr
,
1501 ossl_ec_GFp_simple_field_inv
,
1502 0, /* field_encode */
1503 0, /* field_decode */
1504 0, /* field_set_to_one */
1505 ossl_ec_key_simple_priv2oct
,
1506 ossl_ec_key_simple_oct2priv
,
1507 0, /* set private */
1508 ossl_ec_key_simple_generate_key
,
1509 ossl_ec_key_simple_check_key
,
1510 ossl_ec_key_simple_generate_public_key
,
1513 ossl_ecdh_simple_compute_key
,
1514 ossl_ecdsa_simple_sign_setup
,
1515 ossl_ecdsa_simple_sign_sig
,
1516 ossl_ecdsa_simple_verify_sig
,
1517 0, /* field_inverse_mod_ord */
1518 0, /* blind_coordinates */
1520 0, /* ladder_step */
1527 /******************************************************************************/
1529 * FUNCTIONS TO MANAGE PRECOMPUTATION
1532 static NISTP384_PRE_COMP
*nistp384_pre_comp_new(void)
1534 NISTP384_PRE_COMP
*ret
= OPENSSL_zalloc(sizeof(*ret
));
1539 if (!CRYPTO_NEW_REF(&ret
->references
, 1)) {
1546 NISTP384_PRE_COMP
*ossl_ec_nistp384_pre_comp_dup(NISTP384_PRE_COMP
*p
)
1551 CRYPTO_UP_REF(&p
->references
, &i
);
1555 void ossl_ec_nistp384_pre_comp_free(NISTP384_PRE_COMP
*p
)
1562 CRYPTO_DOWN_REF(&p
->references
, &i
);
1563 REF_PRINT_COUNT("ossl_ec_nistp384", p
);
1566 REF_ASSERT_ISNT(i
< 0);
1568 CRYPTO_FREE_REF(&p
->references
);
1572 /******************************************************************************/
1574 * OPENSSL EC_METHOD FUNCTIONS
1577 int ossl_ec_GFp_nistp384_group_init(EC_GROUP
*group
)
1581 ret
= ossl_ec_GFp_simple_group_init(group
);
1582 group
->a_is_minus3
= 1;
1586 int ossl_ec_GFp_nistp384_group_set_curve(EC_GROUP
*group
, const BIGNUM
*p
,
1587 const BIGNUM
*a
, const BIGNUM
*b
,
1591 BIGNUM
*curve_p
, *curve_a
, *curve_b
;
1593 BN_CTX
*new_ctx
= NULL
;
1596 ctx
= new_ctx
= BN_CTX_new();
1602 curve_p
= BN_CTX_get(ctx
);
1603 curve_a
= BN_CTX_get(ctx
);
1604 curve_b
= BN_CTX_get(ctx
);
1605 if (curve_b
== NULL
)
1607 BN_bin2bn(nistp384_curve_params
[0], sizeof(felem_bytearray
), curve_p
);
1608 BN_bin2bn(nistp384_curve_params
[1], sizeof(felem_bytearray
), curve_a
);
1609 BN_bin2bn(nistp384_curve_params
[2], sizeof(felem_bytearray
), curve_b
);
1610 if ((BN_cmp(curve_p
, p
)) || (BN_cmp(curve_a
, a
)) || (BN_cmp(curve_b
, b
))) {
1611 ERR_raise(ERR_LIB_EC
, EC_R_WRONG_CURVE_PARAMETERS
);
1614 group
->field_mod_func
= BN_nist_mod_384
;
1615 ret
= ossl_ec_GFp_simple_group_set_curve(group
, p
, a
, b
, ctx
);
1619 BN_CTX_free(new_ctx
);
1625 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1628 int ossl_ec_GFp_nistp384_point_get_affine_coordinates(const EC_GROUP
*group
,
1629 const EC_POINT
*point
,
1630 BIGNUM
*x
, BIGNUM
*y
,
1633 felem z1
, z2
, x_in
, y_in
, x_out
, y_out
;
1636 if (EC_POINT_is_at_infinity(group
, point
)) {
1637 ERR_raise(ERR_LIB_EC
, EC_R_POINT_AT_INFINITY
);
1640 if ((!BN_to_felem(x_in
, point
->X
)) || (!BN_to_felem(y_in
, point
->Y
)) ||
1641 (!BN_to_felem(z1
, point
->Z
)))
1644 felem_square(tmp
, z2
);
1645 felem_reduce(z1
, tmp
);
1646 felem_mul(tmp
, x_in
, z1
);
1647 felem_reduce(x_in
, tmp
);
1648 felem_contract(x_out
, x_in
);
1650 if (!felem_to_BN(x
, x_out
)) {
1651 ERR_raise(ERR_LIB_EC
, ERR_R_BN_LIB
);
1655 felem_mul(tmp
, z1
, z2
);
1656 felem_reduce(z1
, tmp
);
1657 felem_mul(tmp
, y_in
, z1
);
1658 felem_reduce(y_in
, tmp
);
1659 felem_contract(y_out
, y_in
);
1661 if (!felem_to_BN(y
, y_out
)) {
1662 ERR_raise(ERR_LIB_EC
, ERR_R_BN_LIB
);
1669 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1670 static void make_points_affine(size_t num
, felem points
[][3],
1674 * Runs in constant time, unless an input is the point at infinity (which
1675 * normally shouldn't happen).
1677 ossl_ec_GFp_nistp_points_make_affine_internal(num
,
1681 (void (*)(void *))felem_one
,
1683 (void (*)(void *, const void *))
1685 (void (*)(void *, const void *))
1686 felem_square_reduce
,
1687 (void (*)(void *, const void *, const void*))
1689 (void (*)(void *, const void *))
1691 (void (*)(void *, const void *))
1696 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1697 * values Result is stored in r (r can equal one of the inputs).
1699 int ossl_ec_GFp_nistp384_points_mul(const EC_GROUP
*group
, EC_POINT
*r
,
1700 const BIGNUM
*scalar
, size_t num
,
1701 const EC_POINT
*points
[],
1702 const BIGNUM
*scalars
[], BN_CTX
*ctx
)
1707 BIGNUM
*x
, *y
, *z
, *tmp_scalar
;
1708 felem_bytearray g_secret
;
1709 felem_bytearray
*secrets
= NULL
;
1710 felem (*pre_comp
)[17][3] = NULL
;
1711 felem
*tmp_felems
= NULL
;
1714 int have_pre_comp
= 0;
1715 size_t num_points
= num
;
1716 felem x_in
, y_in
, z_in
, x_out
, y_out
, z_out
;
1717 NISTP384_PRE_COMP
*pre
= NULL
;
1718 felem(*g_pre_comp
)[3] = NULL
;
1719 EC_POINT
*generator
= NULL
;
1720 const EC_POINT
*p
= NULL
;
1721 const BIGNUM
*p_scalar
= NULL
;
1724 x
= BN_CTX_get(ctx
);
1725 y
= BN_CTX_get(ctx
);
1726 z
= BN_CTX_get(ctx
);
1727 tmp_scalar
= BN_CTX_get(ctx
);
1728 if (tmp_scalar
== NULL
)
1731 if (scalar
!= NULL
) {
1732 pre
= group
->pre_comp
.nistp384
;
1734 /* we have precomputation, try to use it */
1735 g_pre_comp
= &pre
->g_pre_comp
[0];
1737 /* try to use the standard precomputation */
1738 g_pre_comp
= (felem(*)[3]) gmul
;
1739 generator
= EC_POINT_new(group
);
1740 if (generator
== NULL
)
1742 /* get the generator from precomputation */
1743 if (!felem_to_BN(x
, g_pre_comp
[1][0]) ||
1744 !felem_to_BN(y
, g_pre_comp
[1][1]) ||
1745 !felem_to_BN(z
, g_pre_comp
[1][2])) {
1746 ERR_raise(ERR_LIB_EC
, ERR_R_BN_LIB
);
1749 if (!ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group
,
1753 if (0 == EC_POINT_cmp(group
, generator
, group
->generator
, ctx
))
1754 /* precomputation matches generator */
1758 * we don't have valid precomputation: treat the generator as a
1764 if (num_points
> 0) {
1765 if (num_points
>= 2) {
1767 * unless we precompute multiples for just one point, converting
1768 * those into affine form is time well spent
1772 secrets
= OPENSSL_zalloc(sizeof(*secrets
) * num_points
);
1773 pre_comp
= OPENSSL_zalloc(sizeof(*pre_comp
) * num_points
);
1776 OPENSSL_malloc(sizeof(*tmp_felems
) * (num_points
* 17 + 1));
1777 if ((secrets
== NULL
) || (pre_comp
== NULL
)
1778 || (mixed
&& (tmp_felems
== NULL
)))
1782 * we treat NULL scalars as 0, and NULL points as points at infinity,
1783 * i.e., they contribute nothing to the linear combination
1785 for (i
= 0; i
< num_points
; ++i
) {
1788 * we didn't have a valid precomputation, so we pick the
1791 p
= EC_GROUP_get0_generator(group
);
1794 /* the i^th point */
1796 p_scalar
= scalars
[i
];
1798 if (p_scalar
!= NULL
&& p
!= NULL
) {
1799 /* reduce scalar to 0 <= scalar < 2^384 */
1800 if ((BN_num_bits(p_scalar
) > 384)
1801 || (BN_is_negative(p_scalar
))) {
1803 * this is an unusual input, and we don't guarantee
1806 if (!BN_nnmod(tmp_scalar
, p_scalar
, group
->order
, ctx
)) {
1807 ERR_raise(ERR_LIB_EC
, ERR_R_BN_LIB
);
1810 num_bytes
= BN_bn2lebinpad(tmp_scalar
,
1811 secrets
[i
], sizeof(secrets
[i
]));
1813 num_bytes
= BN_bn2lebinpad(p_scalar
,
1814 secrets
[i
], sizeof(secrets
[i
]));
1816 if (num_bytes
< 0) {
1817 ERR_raise(ERR_LIB_EC
, ERR_R_BN_LIB
);
1820 /* precompute multiples */
1821 if ((!BN_to_felem(x_out
, p
->X
)) ||
1822 (!BN_to_felem(y_out
, p
->Y
)) ||
1823 (!BN_to_felem(z_out
, p
->Z
)))
1825 memcpy(pre_comp
[i
][1][0], x_out
, sizeof(felem
));
1826 memcpy(pre_comp
[i
][1][1], y_out
, sizeof(felem
));
1827 memcpy(pre_comp
[i
][1][2], z_out
, sizeof(felem
));
1828 for (j
= 2; j
<= 16; ++j
) {
1830 point_add(pre_comp
[i
][j
][0], pre_comp
[i
][j
][1], pre_comp
[i
][j
][2],
1831 pre_comp
[i
][1][0], pre_comp
[i
][1][1], pre_comp
[i
][1][2], 0,
1832 pre_comp
[i
][j
- 1][0], pre_comp
[i
][j
- 1][1], pre_comp
[i
][j
- 1][2]);
1834 point_double(pre_comp
[i
][j
][0], pre_comp
[i
][j
][1], pre_comp
[i
][j
][2],
1835 pre_comp
[i
][j
/ 2][0], pre_comp
[i
][j
/ 2][1], pre_comp
[i
][j
/ 2][2]);
1841 make_points_affine(num_points
* 17, pre_comp
[0], tmp_felems
);
1844 /* the scalar for the generator */
1845 if (scalar
!= NULL
&& have_pre_comp
) {
1846 memset(g_secret
, 0, sizeof(g_secret
));
1847 /* reduce scalar to 0 <= scalar < 2^384 */
1848 if ((BN_num_bits(scalar
) > 384) || (BN_is_negative(scalar
))) {
1850 * this is an unusual input, and we don't guarantee
1853 if (!BN_nnmod(tmp_scalar
, scalar
, group
->order
, ctx
)) {
1854 ERR_raise(ERR_LIB_EC
, ERR_R_BN_LIB
);
1857 num_bytes
= BN_bn2lebinpad(tmp_scalar
, g_secret
, sizeof(g_secret
));
1859 num_bytes
= BN_bn2lebinpad(scalar
, g_secret
, sizeof(g_secret
));
1861 /* do the multiplication with generator precomputation */
1862 batch_mul(x_out
, y_out
, z_out
,
1863 (const felem_bytearray(*))secrets
, num_points
,
1865 mixed
, (const felem(*)[17][3])pre_comp
,
1866 (const felem(*)[3])g_pre_comp
);
1868 /* do the multiplication without generator precomputation */
1869 batch_mul(x_out
, y_out
, z_out
,
1870 (const felem_bytearray(*))secrets
, num_points
,
1871 NULL
, mixed
, (const felem(*)[17][3])pre_comp
, NULL
);
1873 /* reduce the output to its unique minimal representation */
1874 felem_contract(x_in
, x_out
);
1875 felem_contract(y_in
, y_out
);
1876 felem_contract(z_in
, z_out
);
1877 if ((!felem_to_BN(x
, x_in
)) || (!felem_to_BN(y
, y_in
)) ||
1878 (!felem_to_BN(z
, z_in
))) {
1879 ERR_raise(ERR_LIB_EC
, ERR_R_BN_LIB
);
1882 ret
= ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group
, r
, x
, y
, z
,
1887 EC_POINT_free(generator
);
1888 OPENSSL_free(secrets
);
1889 OPENSSL_free(pre_comp
);
1890 OPENSSL_free(tmp_felems
);
1894 int ossl_ec_GFp_nistp384_precompute_mult(EC_GROUP
*group
, BN_CTX
*ctx
)
1897 NISTP384_PRE_COMP
*pre
= NULL
;
1900 EC_POINT
*generator
= NULL
;
1901 felem tmp_felems
[16];
1903 BN_CTX
*new_ctx
= NULL
;
1906 /* throw away old precomputation */
1907 EC_pre_comp_free(group
);
1911 ctx
= new_ctx
= BN_CTX_new();
1917 x
= BN_CTX_get(ctx
);
1918 y
= BN_CTX_get(ctx
);
1921 /* get the generator */
1922 if (group
->generator
== NULL
)
1924 generator
= EC_POINT_new(group
);
1925 if (generator
== NULL
)
1927 BN_bin2bn(nistp384_curve_params
[3], sizeof(felem_bytearray
), x
);
1928 BN_bin2bn(nistp384_curve_params
[4], sizeof(felem_bytearray
), y
);
1929 if (!EC_POINT_set_affine_coordinates(group
, generator
, x
, y
, ctx
))
1931 if ((pre
= nistp384_pre_comp_new()) == NULL
)
1934 * if the generator is the standard one, use built-in precomputation
1936 if (0 == EC_POINT_cmp(group
, generator
, group
->generator
, ctx
)) {
1937 memcpy(pre
->g_pre_comp
, gmul
, sizeof(pre
->g_pre_comp
));
1940 if ((!BN_to_felem(pre
->g_pre_comp
[1][0], group
->generator
->X
)) ||
1941 (!BN_to_felem(pre
->g_pre_comp
[1][1], group
->generator
->Y
)) ||
1942 (!BN_to_felem(pre
->g_pre_comp
[1][2], group
->generator
->Z
)))
1944 /* compute 2^95*G, 2^190*G, 2^285*G */
1945 for (i
= 1; i
<= 4; i
<<= 1) {
1946 point_double(pre
->g_pre_comp
[2 * i
][0], pre
->g_pre_comp
[2 * i
][1], pre
->g_pre_comp
[2 * i
][2],
1947 pre
->g_pre_comp
[i
][0], pre
->g_pre_comp
[i
][1], pre
->g_pre_comp
[i
][2]);
1948 for (j
= 0; j
< 94; ++j
) {
1949 point_double(pre
->g_pre_comp
[2 * i
][0], pre
->g_pre_comp
[2 * i
][1], pre
->g_pre_comp
[2 * i
][2],
1950 pre
->g_pre_comp
[2 * i
][0], pre
->g_pre_comp
[2 * i
][1], pre
->g_pre_comp
[2 * i
][2]);
1953 /* g_pre_comp[0] is the point at infinity */
1954 memset(pre
->g_pre_comp
[0], 0, sizeof(pre
->g_pre_comp
[0]));
1955 /* the remaining multiples */
1956 /* 2^95*G + 2^190*G */
1957 point_add(pre
->g_pre_comp
[6][0], pre
->g_pre_comp
[6][1], pre
->g_pre_comp
[6][2],
1958 pre
->g_pre_comp
[4][0], pre
->g_pre_comp
[4][1], pre
->g_pre_comp
[4][2], 0,
1959 pre
->g_pre_comp
[2][0], pre
->g_pre_comp
[2][1], pre
->g_pre_comp
[2][2]);
1960 /* 2^95*G + 2^285*G */
1961 point_add(pre
->g_pre_comp
[10][0], pre
->g_pre_comp
[10][1], pre
->g_pre_comp
[10][2],
1962 pre
->g_pre_comp
[8][0], pre
->g_pre_comp
[8][1], pre
->g_pre_comp
[8][2], 0,
1963 pre
->g_pre_comp
[2][0], pre
->g_pre_comp
[2][1], pre
->g_pre_comp
[2][2]);
1964 /* 2^190*G + 2^285*G */
1965 point_add(pre
->g_pre_comp
[12][0], pre
->g_pre_comp
[12][1], pre
->g_pre_comp
[12][2],
1966 pre
->g_pre_comp
[8][0], pre
->g_pre_comp
[8][1], pre
->g_pre_comp
[8][2], 0,
1967 pre
->g_pre_comp
[4][0], pre
->g_pre_comp
[4][1], pre
->g_pre_comp
[4][2]);
1968 /* 2^95*G + 2^190*G + 2^285*G */
1969 point_add(pre
->g_pre_comp
[14][0], pre
->g_pre_comp
[14][1], pre
->g_pre_comp
[14][2],
1970 pre
->g_pre_comp
[12][0], pre
->g_pre_comp
[12][1], pre
->g_pre_comp
[12][2], 0,
1971 pre
->g_pre_comp
[2][0], pre
->g_pre_comp
[2][1], pre
->g_pre_comp
[2][2]);
1972 for (i
= 1; i
< 8; ++i
) {
1973 /* odd multiples: add G */
1974 point_add(pre
->g_pre_comp
[2 * i
+ 1][0], pre
->g_pre_comp
[2 * i
+ 1][1], pre
->g_pre_comp
[2 * i
+ 1][2],
1975 pre
->g_pre_comp
[2 * i
][0], pre
->g_pre_comp
[2 * i
][1], pre
->g_pre_comp
[2 * i
][2], 0,
1976 pre
->g_pre_comp
[1][0], pre
->g_pre_comp
[1][1], pre
->g_pre_comp
[1][2]);
1978 make_points_affine(15, &(pre
->g_pre_comp
[1]), tmp_felems
);
1981 SETPRECOMP(group
, nistp384
, pre
);
1986 EC_POINT_free(generator
);
1988 BN_CTX_free(new_ctx
);
1990 ossl_ec_nistp384_pre_comp_free(pre
);
1994 int ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP
*group
)
1996 return HAVEPRECOMP(group
, nistp384
);