]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/ec/ecp_nistp384.c
threads_pthread.c: change inline to ossl_inline
[thirdparty/openssl.git] / crypto / ec / ecp_nistp384.c
1 /*
2 * Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
3 *
4 * Licensed under the Apache License 2.0 (the "License"). You may not use
5 * this file except in compliance with the License. You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
8 */
9
10 /* Copyright 2023 IBM Corp.
11 *
12 * Licensed under the Apache License, Version 2.0 (the "License");
13 *
14 * you may not use this file except in compliance with the License.
15 * You may obtain a copy of the License at
16 *
17 * http://www.apache.org/licenses/LICENSE-2.0
18 *
19 * Unless required by applicable law or agreed to in writing, software
20 * distributed under the License is distributed on an "AS IS" BASIS,
21 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22 * See the License for the specific language governing permissions and
23 * limitations under the License.
24 */
25
26 /*
27 * Designed for 56-bit limbs by Rohan McLure <rohan.mclure@linux.ibm.com>.
28 * The layout is based on that of ecp_nistp{224,521}.c, allowing even for asm
29 * acceleration of felem_{square,mul} as supported in these files.
30 */
31
32 #include <openssl/e_os2.h>
33
34 #include <string.h>
35 #include <openssl/err.h>
36 #include "ec_local.h"
37
38 #include "internal/numbers.h"
39
40 #ifndef INT128_MAX
41 # error "Your compiler doesn't appear to support 128-bit integer types"
42 #endif
43
44 typedef uint8_t u8;
45 typedef uint64_t u64;
46
47 /*
48 * The underlying field. P384 operates over GF(2^384-2^128-2^96+2^32-1). We
49 * can serialize an element of this field into 48 bytes. We call this an
50 * felem_bytearray.
51 */
52
53 typedef u8 felem_bytearray[48];
54
55 /*
56 * These are the parameters of P384, taken from FIPS 186-3, section D.1.2.4.
57 * These values are big-endian.
58 */
59 static const felem_bytearray nistp384_curve_params[5] = {
60 {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
61 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
62 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
63 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF},
64 {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a = -3 */
65 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
66 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
67 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFC},
68 {0xB3, 0x31, 0x2F, 0xA7, 0xE2, 0x3E, 0xE7, 0xE4, 0x98, 0x8E, 0x05, 0x6B, /* b */
69 0xE3, 0xF8, 0x2D, 0x19, 0x18, 0x1D, 0x9C, 0x6E, 0xFE, 0x81, 0x41, 0x12,
70 0x03, 0x14, 0x08, 0x8F, 0x50, 0x13, 0x87, 0x5A, 0xC6, 0x56, 0x39, 0x8D,
71 0x8A, 0x2E, 0xD1, 0x9D, 0x2A, 0x85, 0xC8, 0xED, 0xD3, 0xEC, 0x2A, 0xEF},
72 {0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37, 0x8E, 0xB1, 0xC7, 0x1E, /* x */
73 0xF3, 0x20, 0xAD, 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, 0x98,
74 0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, 0x38, 0x55, 0x02, 0xF2, 0x5D,
75 0xBF, 0x55, 0x29, 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, 0xB7},
76 {0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C, 0x6F, 0x5D, 0x9E, 0x98, 0xBF, /* y */
77 0x92, 0x92, 0xDC, 0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14, 0x7C,
78 0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8, 0xC0, 0x0A, 0x60, 0xB1, 0xCE,
79 0x1D, 0x7E, 0x81, 0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E, 0x5F},
80 };
81
82 /*-
83 * The representation of field elements.
84 * ------------------------------------
85 *
86 * We represent field elements with seven values. These values are either 64 or
87 * 128 bits and the field element represented is:
88 * v[0]*2^0 + v[1]*2^56 + v[2]*2^112 + ... + v[6]*2^336 (mod p)
89 * Each of the seven values is called a 'limb'. Since the limbs are spaced only
90 * 56 bits apart, but are greater than 56 bits in length, the most significant
91 * bits of each limb overlap with the least significant bits of the next
92 *
93 * This representation is considered to be 'redundant' in the sense that
94 * intermediate values can each contain more than a 56-bit value in each limb.
95 * Reduction causes all but the final limb to be reduced to contain a value less
96 * than 2^56, with the final value represented allowed to be larger than 2^384,
97 * inasmuch as we can be sure that arithmetic overflow remains impossible. The
98 * reduced value must of course be congruent to the unreduced value.
99 *
100 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
101 * 'widefelem', featuring enough bits to store the result of a multiplication
102 * and even some further arithmetic without need for immediate reduction.
103 */
104
105 #define NLIMBS 7
106
107 typedef uint64_t limb;
108 typedef uint128_t widelimb;
109 typedef limb limb_aX __attribute((__aligned__(1)));
110 typedef limb felem[NLIMBS];
111 typedef widelimb widefelem[2*NLIMBS-1];
112
113 static const limb bottom56bits = 0xffffffffffffff;
114
115 /* Helper functions (de)serialising reduced field elements in little endian */
116 static void bin48_to_felem(felem out, const u8 in[48])
117 {
118 memset(out, 0, 56);
119 out[0] = (*((limb *) & in[0])) & bottom56bits;
120 out[1] = (*((limb_aX *) & in[7])) & bottom56bits;
121 out[2] = (*((limb_aX *) & in[14])) & bottom56bits;
122 out[3] = (*((limb_aX *) & in[21])) & bottom56bits;
123 out[4] = (*((limb_aX *) & in[28])) & bottom56bits;
124 out[5] = (*((limb_aX *) & in[35])) & bottom56bits;
125 memmove(&out[6], &in[42], 6);
126 }
127
128 static void felem_to_bin48(u8 out[48], const felem in)
129 {
130 memset(out, 0, 48);
131 (*((limb *) & out[0])) |= (in[0] & bottom56bits);
132 (*((limb_aX *) & out[7])) |= (in[1] & bottom56bits);
133 (*((limb_aX *) & out[14])) |= (in[2] & bottom56bits);
134 (*((limb_aX *) & out[21])) |= (in[3] & bottom56bits);
135 (*((limb_aX *) & out[28])) |= (in[4] & bottom56bits);
136 (*((limb_aX *) & out[35])) |= (in[5] & bottom56bits);
137 memmove(&out[42], &in[6], 6);
138 }
139
140 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
141 static int BN_to_felem(felem out, const BIGNUM *bn)
142 {
143 felem_bytearray b_out;
144 int num_bytes;
145
146 if (BN_is_negative(bn)) {
147 ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
148 return 0;
149 }
150 num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
151 if (num_bytes < 0) {
152 ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
153 return 0;
154 }
155 bin48_to_felem(out, b_out);
156 return 1;
157 }
158
159 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
160 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
161 {
162 felem_bytearray b_out;
163
164 felem_to_bin48(b_out, in);
165 return BN_lebin2bn(b_out, sizeof(b_out), out);
166 }
167
168 /*-
169 * Field operations
170 * ----------------
171 */
172
173 static void felem_one(felem out)
174 {
175 out[0] = 1;
176 memset(&out[1], 0, sizeof(limb) * (NLIMBS-1));
177 }
178
179 static void felem_assign(felem out, const felem in)
180 {
181 memcpy(out, in, sizeof(felem));
182 }
183
184 /* felem_sum64 sets out = out + in. */
185 static void felem_sum64(felem out, const felem in)
186 {
187 unsigned int i;
188
189 for (i = 0; i < NLIMBS; i++)
190 out[i] += in[i];
191 }
192
193 /* felem_scalar sets out = in * scalar */
194 static void felem_scalar(felem out, const felem in, limb scalar)
195 {
196 unsigned int i;
197
198 for (i = 0; i < NLIMBS; i++)
199 out[i] = in[i] * scalar;
200 }
201
202 /* felem_scalar64 sets out = out * scalar */
203 static void felem_scalar64(felem out, limb scalar)
204 {
205 unsigned int i;
206
207 for (i = 0; i < NLIMBS; i++)
208 out[i] *= scalar;
209 }
210
211 /* felem_scalar128 sets out = out * scalar */
212 static void felem_scalar128(widefelem out, limb scalar)
213 {
214 unsigned int i;
215
216 for (i = 0; i < 2*NLIMBS-1; i++)
217 out[i] *= scalar;
218 }
219
220 /*-
221 * felem_neg sets |out| to |-in|
222 * On entry:
223 * in[i] < 2^60 - 2^29
224 * On exit:
225 * out[i] < 2^60
226 */
227 static void felem_neg(felem out, const felem in)
228 {
229 /*
230 * In order to prevent underflow, we add a multiple of p before subtracting.
231 * Use telescopic sums to represent 2^12 * p redundantly with each limb
232 * of the form 2^60 + ...
233 */
234 static const limb two60m52m4 = (((limb) 1) << 60)
235 - (((limb) 1) << 52)
236 - (((limb) 1) << 4);
237 static const limb two60p44m12 = (((limb) 1) << 60)
238 + (((limb) 1) << 44)
239 - (((limb) 1) << 12);
240 static const limb two60m28m4 = (((limb) 1) << 60)
241 - (((limb) 1) << 28)
242 - (((limb) 1) << 4);
243 static const limb two60m4 = (((limb) 1) << 60)
244 - (((limb) 1) << 4);
245
246 out[0] = two60p44m12 - in[0];
247 out[1] = two60m52m4 - in[1];
248 out[2] = two60m28m4 - in[2];
249 out[3] = two60m4 - in[3];
250 out[4] = two60m4 - in[4];
251 out[5] = two60m4 - in[5];
252 out[6] = two60m4 - in[6];
253 }
254
255 /*-
256 * felem_diff64 subtracts |in| from |out|
257 * On entry:
258 * in[i] < 2^60 - 2^52 - 2^4
259 * On exit:
260 * out[i] < out_orig[i] + 2^60 + 2^44
261 */
262 static void felem_diff64(felem out, const felem in)
263 {
264 /*
265 * In order to prevent underflow, we add a multiple of p before subtracting.
266 * Use telescopic sums to represent 2^12 * p redundantly with each limb
267 * of the form 2^60 + ...
268 */
269
270 static const limb two60m52m4 = (((limb) 1) << 60)
271 - (((limb) 1) << 52)
272 - (((limb) 1) << 4);
273 static const limb two60p44m12 = (((limb) 1) << 60)
274 + (((limb) 1) << 44)
275 - (((limb) 1) << 12);
276 static const limb two60m28m4 = (((limb) 1) << 60)
277 - (((limb) 1) << 28)
278 - (((limb) 1) << 4);
279 static const limb two60m4 = (((limb) 1) << 60)
280 - (((limb) 1) << 4);
281
282 out[0] += two60p44m12 - in[0];
283 out[1] += two60m52m4 - in[1];
284 out[2] += two60m28m4 - in[2];
285 out[3] += two60m4 - in[3];
286 out[4] += two60m4 - in[4];
287 out[5] += two60m4 - in[5];
288 out[6] += two60m4 - in[6];
289 }
290
291 /*
292 * in[i] < 2^63
293 * out[i] < out_orig[i] + 2^64 + 2^48
294 */
295 static void felem_diff_128_64(widefelem out, const felem in)
296 {
297 /*
298 * In order to prevent underflow, we add a multiple of p before subtracting.
299 * Use telescopic sums to represent 2^16 * p redundantly with each limb
300 * of the form 2^64 + ...
301 */
302
303 static const widelimb two64m56m8 = (((widelimb) 1) << 64)
304 - (((widelimb) 1) << 56)
305 - (((widelimb) 1) << 8);
306 static const widelimb two64m32m8 = (((widelimb) 1) << 64)
307 - (((widelimb) 1) << 32)
308 - (((widelimb) 1) << 8);
309 static const widelimb two64m8 = (((widelimb) 1) << 64)
310 - (((widelimb) 1) << 8);
311 static const widelimb two64p48m16 = (((widelimb) 1) << 64)
312 + (((widelimb) 1) << 48)
313 - (((widelimb) 1) << 16);
314 unsigned int i;
315
316 out[0] += two64p48m16;
317 out[1] += two64m56m8;
318 out[2] += two64m32m8;
319 out[3] += two64m8;
320 out[4] += two64m8;
321 out[5] += two64m8;
322 out[6] += two64m8;
323
324 for (i = 0; i < NLIMBS; i++)
325 out[i] -= in[i];
326 }
327
328 /*
329 * in[i] < 2^127 - 2^119 - 2^71
330 * out[i] < out_orig[i] + 2^127 + 2^111
331 */
332 static void felem_diff128(widefelem out, const widefelem in)
333 {
334 /*
335 * In order to prevent underflow, we add a multiple of p before subtracting.
336 * Use telescopic sums to represent 2^415 * p redundantly with each limb
337 * of the form 2^127 + ...
338 */
339
340 static const widelimb two127 = ((widelimb) 1) << 127;
341 static const widelimb two127m71 = (((widelimb) 1) << 127)
342 - (((widelimb) 1) << 71);
343 static const widelimb two127p111m79m71 = (((widelimb) 1) << 127)
344 + (((widelimb) 1) << 111)
345 - (((widelimb) 1) << 79)
346 - (((widelimb) 1) << 71);
347 static const widelimb two127m119m71 = (((widelimb) 1) << 127)
348 - (((widelimb) 1) << 119)
349 - (((widelimb) 1) << 71);
350 static const widelimb two127m95m71 = (((widelimb) 1) << 127)
351 - (((widelimb) 1) << 95)
352 - (((widelimb) 1) << 71);
353 unsigned int i;
354
355 out[0] += two127;
356 out[1] += two127m71;
357 out[2] += two127m71;
358 out[3] += two127m71;
359 out[4] += two127m71;
360 out[5] += two127m71;
361 out[6] += two127p111m79m71;
362 out[7] += two127m119m71;
363 out[8] += two127m95m71;
364 out[9] += two127m71;
365 out[10] += two127m71;
366 out[11] += two127m71;
367 out[12] += two127m71;
368
369 for (i = 0; i < 2*NLIMBS-1; i++)
370 out[i] -= in[i];
371 }
372
373 static void felem_square_ref(widefelem out, const felem in)
374 {
375 felem inx2;
376 felem_scalar(inx2, in, 2);
377
378 out[0] = ((uint128_t) in[0]) * in[0];
379
380 out[1] = ((uint128_t) in[0]) * inx2[1];
381
382 out[2] = ((uint128_t) in[0]) * inx2[2]
383 + ((uint128_t) in[1]) * in[1];
384
385 out[3] = ((uint128_t) in[0]) * inx2[3]
386 + ((uint128_t) in[1]) * inx2[2];
387
388 out[4] = ((uint128_t) in[0]) * inx2[4]
389 + ((uint128_t) in[1]) * inx2[3]
390 + ((uint128_t) in[2]) * in[2];
391
392 out[5] = ((uint128_t) in[0]) * inx2[5]
393 + ((uint128_t) in[1]) * inx2[4]
394 + ((uint128_t) in[2]) * inx2[3];
395
396 out[6] = ((uint128_t) in[0]) * inx2[6]
397 + ((uint128_t) in[1]) * inx2[5]
398 + ((uint128_t) in[2]) * inx2[4]
399 + ((uint128_t) in[3]) * in[3];
400
401 out[7] = ((uint128_t) in[1]) * inx2[6]
402 + ((uint128_t) in[2]) * inx2[5]
403 + ((uint128_t) in[3]) * inx2[4];
404
405 out[8] = ((uint128_t) in[2]) * inx2[6]
406 + ((uint128_t) in[3]) * inx2[5]
407 + ((uint128_t) in[4]) * in[4];
408
409 out[9] = ((uint128_t) in[3]) * inx2[6]
410 + ((uint128_t) in[4]) * inx2[5];
411
412 out[10] = ((uint128_t) in[4]) * inx2[6]
413 + ((uint128_t) in[5]) * in[5];
414
415 out[11] = ((uint128_t) in[5]) * inx2[6];
416
417 out[12] = ((uint128_t) in[6]) * in[6];
418 }
419
420 static void felem_mul_ref(widefelem out, const felem in1, const felem in2)
421 {
422 out[0] = ((uint128_t) in1[0]) * in2[0];
423
424 out[1] = ((uint128_t) in1[0]) * in2[1]
425 + ((uint128_t) in1[1]) * in2[0];
426
427 out[2] = ((uint128_t) in1[0]) * in2[2]
428 + ((uint128_t) in1[1]) * in2[1]
429 + ((uint128_t) in1[2]) * in2[0];
430
431 out[3] = ((uint128_t) in1[0]) * in2[3]
432 + ((uint128_t) in1[1]) * in2[2]
433 + ((uint128_t) in1[2]) * in2[1]
434 + ((uint128_t) in1[3]) * in2[0];
435
436 out[4] = ((uint128_t) in1[0]) * in2[4]
437 + ((uint128_t) in1[1]) * in2[3]
438 + ((uint128_t) in1[2]) * in2[2]
439 + ((uint128_t) in1[3]) * in2[1]
440 + ((uint128_t) in1[4]) * in2[0];
441
442 out[5] = ((uint128_t) in1[0]) * in2[5]
443 + ((uint128_t) in1[1]) * in2[4]
444 + ((uint128_t) in1[2]) * in2[3]
445 + ((uint128_t) in1[3]) * in2[2]
446 + ((uint128_t) in1[4]) * in2[1]
447 + ((uint128_t) in1[5]) * in2[0];
448
449 out[6] = ((uint128_t) in1[0]) * in2[6]
450 + ((uint128_t) in1[1]) * in2[5]
451 + ((uint128_t) in1[2]) * in2[4]
452 + ((uint128_t) in1[3]) * in2[3]
453 + ((uint128_t) in1[4]) * in2[2]
454 + ((uint128_t) in1[5]) * in2[1]
455 + ((uint128_t) in1[6]) * in2[0];
456
457 out[7] = ((uint128_t) in1[1]) * in2[6]
458 + ((uint128_t) in1[2]) * in2[5]
459 + ((uint128_t) in1[3]) * in2[4]
460 + ((uint128_t) in1[4]) * in2[3]
461 + ((uint128_t) in1[5]) * in2[2]
462 + ((uint128_t) in1[6]) * in2[1];
463
464 out[8] = ((uint128_t) in1[2]) * in2[6]
465 + ((uint128_t) in1[3]) * in2[5]
466 + ((uint128_t) in1[4]) * in2[4]
467 + ((uint128_t) in1[5]) * in2[3]
468 + ((uint128_t) in1[6]) * in2[2];
469
470 out[9] = ((uint128_t) in1[3]) * in2[6]
471 + ((uint128_t) in1[4]) * in2[5]
472 + ((uint128_t) in1[5]) * in2[4]
473 + ((uint128_t) in1[6]) * in2[3];
474
475 out[10] = ((uint128_t) in1[4]) * in2[6]
476 + ((uint128_t) in1[5]) * in2[5]
477 + ((uint128_t) in1[6]) * in2[4];
478
479 out[11] = ((uint128_t) in1[5]) * in2[6]
480 + ((uint128_t) in1[6]) * in2[5];
481
482 out[12] = ((uint128_t) in1[6]) * in2[6];
483 }
484
485 /*-
486 * Reduce thirteen 128-bit coefficients to seven 64-bit coefficients.
487 * in[i] < 2^128 - 2^125
488 * out[i] < 2^56 for i < 6,
489 * out[6] <= 2^48
490 *
491 * The technique in use here stems from the format of the prime modulus:
492 * P384 = 2^384 - delta
493 *
494 * Thus we can reduce numbers of the form (X + 2^384 * Y) by substituting
495 * them with (X + delta Y), with delta = 2^128 + 2^96 + (-2^32 + 1). These
496 * coefficients are still quite large, and so we repeatedly apply this
497 * technique on high-order bits in order to guarantee the desired bounds on
498 * the size of our output.
499 *
500 * The three phases of elimination are as follows:
501 * [1]: Y = 2^120 (in[12] | in[11] | in[10] | in[9])
502 * [2]: Y = 2^8 (acc[8] | acc[7])
503 * [3]: Y = 2^48 (acc[6] >> 48)
504 * (Where a | b | c | d = (2^56)^3 a + (2^56)^2 b + (2^56) c + d)
505 */
506 static void felem_reduce(felem out, const widefelem in)
507 {
508 /*
509 * In order to prevent underflow, we add a multiple of p before subtracting.
510 * Use telescopic sums to represent 2^76 * p redundantly with each limb
511 * of the form 2^124 + ...
512 */
513 static const widelimb two124m68 = (((widelimb) 1) << 124)
514 - (((widelimb) 1) << 68);
515 static const widelimb two124m116m68 = (((widelimb) 1) << 124)
516 - (((widelimb) 1) << 116)
517 - (((widelimb) 1) << 68);
518 static const widelimb two124p108m76 = (((widelimb) 1) << 124)
519 + (((widelimb) 1) << 108)
520 - (((widelimb) 1) << 76);
521 static const widelimb two124m92m68 = (((widelimb) 1) << 124)
522 - (((widelimb) 1) << 92)
523 - (((widelimb) 1) << 68);
524 widelimb temp, acc[9];
525 unsigned int i;
526
527 memcpy(acc, in, sizeof(widelimb) * 9);
528
529 acc[0] += two124p108m76;
530 acc[1] += two124m116m68;
531 acc[2] += two124m92m68;
532 acc[3] += two124m68;
533 acc[4] += two124m68;
534 acc[5] += two124m68;
535 acc[6] += two124m68;
536
537 /* [1]: Eliminate in[9], ..., in[12] */
538 acc[8] += in[12] >> 32;
539 acc[7] += (in[12] & 0xffffffff) << 24;
540 acc[7] += in[12] >> 8;
541 acc[6] += (in[12] & 0xff) << 48;
542 acc[6] -= in[12] >> 16;
543 acc[5] -= (in[12] & 0xffff) << 40;
544 acc[6] += in[12] >> 48;
545 acc[5] += (in[12] & 0xffffffffffff) << 8;
546
547 acc[7] += in[11] >> 32;
548 acc[6] += (in[11] & 0xffffffff) << 24;
549 acc[6] += in[11] >> 8;
550 acc[5] += (in[11] & 0xff) << 48;
551 acc[5] -= in[11] >> 16;
552 acc[4] -= (in[11] & 0xffff) << 40;
553 acc[5] += in[11] >> 48;
554 acc[4] += (in[11] & 0xffffffffffff) << 8;
555
556 acc[6] += in[10] >> 32;
557 acc[5] += (in[10] & 0xffffffff) << 24;
558 acc[5] += in[10] >> 8;
559 acc[4] += (in[10] & 0xff) << 48;
560 acc[4] -= in[10] >> 16;
561 acc[3] -= (in[10] & 0xffff) << 40;
562 acc[4] += in[10] >> 48;
563 acc[3] += (in[10] & 0xffffffffffff) << 8;
564
565 acc[5] += in[9] >> 32;
566 acc[4] += (in[9] & 0xffffffff) << 24;
567 acc[4] += in[9] >> 8;
568 acc[3] += (in[9] & 0xff) << 48;
569 acc[3] -= in[9] >> 16;
570 acc[2] -= (in[9] & 0xffff) << 40;
571 acc[3] += in[9] >> 48;
572 acc[2] += (in[9] & 0xffffffffffff) << 8;
573
574 /*
575 * [2]: Eliminate acc[7], acc[8], that is the 7 and eighth limbs, as
576 * well as the contributions made from eliminating higher limbs.
577 * acc[7] < in[7] + 2^120 + 2^56 < in[7] + 2^121
578 * acc[8] < in[8] + 2^96
579 */
580 acc[4] += acc[8] >> 32;
581 acc[3] += (acc[8] & 0xffffffff) << 24;
582 acc[3] += acc[8] >> 8;
583 acc[2] += (acc[8] & 0xff) << 48;
584 acc[2] -= acc[8] >> 16;
585 acc[1] -= (acc[8] & 0xffff) << 40;
586 acc[2] += acc[8] >> 48;
587 acc[1] += (acc[8] & 0xffffffffffff) << 8;
588
589 acc[3] += acc[7] >> 32;
590 acc[2] += (acc[7] & 0xffffffff) << 24;
591 acc[2] += acc[7] >> 8;
592 acc[1] += (acc[7] & 0xff) << 48;
593 acc[1] -= acc[7] >> 16;
594 acc[0] -= (acc[7] & 0xffff) << 40;
595 acc[1] += acc[7] >> 48;
596 acc[0] += (acc[7] & 0xffffffffffff) << 8;
597
598 /*-
599 * acc[k] < in[k] + 2^124 + 2^121
600 * < in[k] + 2^125
601 * < 2^128, for k <= 6
602 */
603
604 /*
605 * Carry 4 -> 5 -> 6
606 * This has the effect of ensuring that these more significant limbs
607 * will be small in value after eliminating high bits from acc[6].
608 */
609 acc[5] += acc[4] >> 56;
610 acc[4] &= 0x00ffffffffffffff;
611
612 acc[6] += acc[5] >> 56;
613 acc[5] &= 0x00ffffffffffffff;
614
615 /*-
616 * acc[6] < in[6] + 2^124 + 2^121 + 2^72 + 2^16
617 * < in[6] + 2^125
618 * < 2^128
619 */
620
621 /* [3]: Eliminate high bits of acc[6] */
622 temp = acc[6] >> 48;
623 acc[6] &= 0x0000ffffffffffff;
624
625 /* temp < 2^80 */
626
627 acc[3] += temp >> 40;
628 acc[2] += (temp & 0xffffffffff) << 16;
629 acc[2] += temp >> 16;
630 acc[1] += (temp & 0xffff) << 40;
631 acc[1] -= temp >> 24;
632 acc[0] -= (temp & 0xffffff) << 32;
633 acc[0] += temp;
634
635 /*-
636 * acc[k] < acc_old[k] + 2^64 + 2^56
637 * < in[k] + 2^124 + 2^121 + 2^72 + 2^64 + 2^56 + 2^16 , k < 4
638 */
639
640 /* Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
641 acc[1] += acc[0] >> 56; /* acc[1] < acc_old[1] + 2^72 */
642 acc[0] &= 0x00ffffffffffffff;
643
644 acc[2] += acc[1] >> 56; /* acc[2] < acc_old[2] + 2^72 + 2^16 */
645 acc[1] &= 0x00ffffffffffffff;
646
647 acc[3] += acc[2] >> 56; /* acc[3] < acc_old[3] + 2^72 + 2^16 */
648 acc[2] &= 0x00ffffffffffffff;
649
650 /*-
651 * acc[k] < acc_old[k] + 2^72 + 2^16
652 * < in[k] + 2^124 + 2^121 + 2^73 + 2^64 + 2^56 + 2^17
653 * < in[k] + 2^125
654 * < 2^128 , k < 4
655 */
656
657 acc[4] += acc[3] >> 56; /*-
658 * acc[4] < acc_old[4] + 2^72 + 2^16
659 * < 2^72 + 2^56 + 2^16
660 */
661 acc[3] &= 0x00ffffffffffffff;
662
663 acc[5] += acc[4] >> 56; /*-
664 * acc[5] < acc_old[5] + 2^16 + 1
665 * < 2^56 + 2^16 + 1
666 */
667 acc[4] &= 0x00ffffffffffffff;
668
669 acc[6] += acc[5] >> 56; /* acc[6] < 2^48 + 1 <= 2^48 */
670 acc[5] &= 0x00ffffffffffffff;
671
672 for (i = 0; i < NLIMBS; i++)
673 out[i] = acc[i];
674 }
675
676 #if defined(ECP_NISTP384_ASM)
677 static void felem_square_wrapper(widefelem out, const felem in);
678 static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2);
679
680 static void (*felem_square_p)(widefelem out, const felem in) =
681 felem_square_wrapper;
682 static void (*felem_mul_p)(widefelem out, const felem in1, const felem in2) =
683 felem_mul_wrapper;
684
685 void p384_felem_square(widefelem out, const felem in);
686 void p384_felem_mul(widefelem out, const felem in1, const felem in2);
687
688 # if defined(_ARCH_PPC64)
689 # include "crypto/ppc_arch.h"
690 # endif
691
692 static void felem_select(void)
693 {
694 # if defined(_ARCH_PPC64)
695 if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
696 felem_square_p = p384_felem_square;
697 felem_mul_p = p384_felem_mul;
698
699 return;
700 }
701 # endif
702
703 /* Default */
704 felem_square_p = felem_square_ref;
705 felem_mul_p = felem_mul_ref;
706 }
707
708 static void felem_square_wrapper(widefelem out, const felem in)
709 {
710 felem_select();
711 felem_square_p(out, in);
712 }
713
714 static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2)
715 {
716 felem_select();
717 felem_mul_p(out, in1, in2);
718 }
719
720 # define felem_square felem_square_p
721 # define felem_mul felem_mul_p
722 #else
723 # define felem_square felem_square_ref
724 # define felem_mul felem_mul_ref
725 #endif
726
727 static ossl_inline void felem_square_reduce(felem out, const felem in)
728 {
729 widefelem tmp;
730
731 felem_square(tmp, in);
732 felem_reduce(out, tmp);
733 }
734
735 static ossl_inline void felem_mul_reduce(felem out, const felem in1, const felem in2)
736 {
737 widefelem tmp;
738
739 felem_mul(tmp, in1, in2);
740 felem_reduce(out, tmp);
741 }
742
743 /*-
744 * felem_inv calculates |out| = |in|^{-1}
745 *
746 * Based on Fermat's Little Theorem:
747 * a^p = a (mod p)
748 * a^{p-1} = 1 (mod p)
749 * a^{p-2} = a^{-1} (mod p)
750 */
751 static void felem_inv(felem out, const felem in)
752 {
753 felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6;
754 unsigned int i = 0;
755
756 felem_square_reduce(ftmp, in); /* 2^1 */
757 felem_mul_reduce(ftmp, ftmp, in); /* 2^1 + 2^0 */
758 felem_assign(ftmp2, ftmp);
759
760 felem_square_reduce(ftmp, ftmp); /* 2^2 + 2^1 */
761 felem_mul_reduce(ftmp, ftmp, in); /* 2^2 + 2^1 * 2^0 */
762 felem_assign(ftmp3, ftmp);
763
764 for (i = 0; i < 3; i++)
765 felem_square_reduce(ftmp, ftmp); /* 2^5 + 2^4 + 2^3 */
766 felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0 */
767 felem_assign(ftmp4, ftmp);
768
769 for (i = 0; i < 6; i++)
770 felem_square_reduce(ftmp, ftmp); /* 2^11 + ... + 2^6 */
771 felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^11 + ... + 2^0 */
772
773 for (i = 0; i < 3; i++)
774 felem_square_reduce(ftmp, ftmp); /* 2^14 + ... + 2^3 */
775 felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^14 + ... + 2^0 */
776 felem_assign(ftmp5, ftmp);
777
778 for (i = 0; i < 15; i++)
779 felem_square_reduce(ftmp, ftmp); /* 2^29 + ... + 2^15 */
780 felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^29 + ... + 2^0 */
781 felem_assign(ftmp6, ftmp);
782
783 for (i = 0; i < 30; i++)
784 felem_square_reduce(ftmp, ftmp); /* 2^59 + ... + 2^30 */
785 felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^59 + ... + 2^0 */
786 felem_assign(ftmp4, ftmp);
787
788 for (i = 0; i < 60; i++)
789 felem_square_reduce(ftmp, ftmp); /* 2^119 + ... + 2^60 */
790 felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^119 + ... + 2^0 */
791 felem_assign(ftmp4, ftmp);
792
793 for (i = 0; i < 120; i++)
794 felem_square_reduce(ftmp, ftmp); /* 2^239 + ... + 2^120 */
795 felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^239 + ... + 2^0 */
796
797 for (i = 0; i < 15; i++)
798 felem_square_reduce(ftmp, ftmp); /* 2^254 + ... + 2^15 */
799 felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^254 + ... + 2^0 */
800
801 for (i = 0; i < 31; i++)
802 felem_square_reduce(ftmp, ftmp); /* 2^285 + ... + 2^31 */
803 felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^285 + ... + 2^31 + 2^29 + ... + 2^0 */
804
805 for (i = 0; i < 2; i++)
806 felem_square_reduce(ftmp, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^2 */
807 felem_mul_reduce(ftmp, ftmp2, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^0 */
808
809 for (i = 0; i < 94; i++)
810 felem_square_reduce(ftmp, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 */
811 felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 + 2^29 + ... + 2^0 */
812
813 for (i = 0; i < 2; i++)
814 felem_square_reduce(ftmp, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 */
815 felem_mul_reduce(ftmp, in, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 + 2^0 */
816
817 memcpy(out, ftmp, sizeof(felem));
818 }
819
820 /*
821 * Zero-check: returns a limb with all bits set if |in| == 0 (mod p)
822 * and 0 otherwise. We know that field elements are reduced to
823 * 0 < in < 2p, so we only need to check two cases:
824 * 0 and 2^384 - 2^128 - 2^96 + 2^32 - 1
825 * in[k] < 2^56, k < 6
826 * in[6] <= 2^48
827 */
828 static limb felem_is_zero(const felem in)
829 {
830 limb zero, p384;
831
832 zero = in[0] | in[1] | in[2] | in[3] | in[4] | in[5] | in[6];
833 zero = ((int64_t) (zero) - 1) >> 63;
834 p384 = (in[0] ^ 0x000000ffffffff) | (in[1] ^ 0xffff0000000000)
835 | (in[2] ^ 0xfffffffffeffff) | (in[3] ^ 0xffffffffffffff)
836 | (in[4] ^ 0xffffffffffffff) | (in[5] ^ 0xffffffffffffff)
837 | (in[6] ^ 0xffffffffffff);
838 p384 = ((int64_t) (p384) - 1) >> 63;
839
840 return (zero | p384);
841 }
842
843 static int felem_is_zero_int(const void *in)
844 {
845 return (int)(felem_is_zero(in) & ((limb) 1));
846 }
847
848 /*-
849 * felem_contract converts |in| to its unique, minimal representation.
850 * Assume we've removed all redundant bits.
851 * On entry:
852 * in[k] < 2^56, k < 6
853 * in[6] <= 2^48
854 */
855 static void felem_contract(felem out, const felem in)
856 {
857 static const int64_t two56 = ((limb) 1) << 56;
858
859 /*
860 * We know for a fact that 0 <= |in| < 2*p, for p = 2^384 - 2^128 - 2^96 + 2^32 - 1
861 * Perform two successive, idempotent subtractions to reduce if |in| >= p.
862 */
863
864 int64_t tmp[NLIMBS], cond[5], a;
865 unsigned int i;
866
867 memcpy(tmp, in, sizeof(felem));
868
869 /* Case 1: a = 1 iff |in| >= 2^384 */
870 a = (in[6] >> 48);
871 tmp[0] += a;
872 tmp[0] -= a << 32;
873 tmp[1] += a << 40;
874 tmp[2] += a << 16;
875 tmp[6] &= 0x0000ffffffffffff;
876
877 /*
878 * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
879 * non-zero, so we only need one step
880 */
881
882 a = tmp[0] >> 63;
883 tmp[0] += a & two56;
884 tmp[1] -= a & 1;
885
886 /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
887 tmp[2] += tmp[1] >> 56;
888 tmp[1] &= 0x00ffffffffffffff;
889
890 tmp[3] += tmp[2] >> 56;
891 tmp[2] &= 0x00ffffffffffffff;
892
893 tmp[4] += tmp[3] >> 56;
894 tmp[3] &= 0x00ffffffffffffff;
895
896 tmp[5] += tmp[4] >> 56;
897 tmp[4] &= 0x00ffffffffffffff;
898
899 tmp[6] += tmp[5] >> 56; /* tmp[6] < 2^48 */
900 tmp[5] &= 0x00ffffffffffffff;
901
902 /*
903 * Case 2: a = all ones if p <= |in| < 2^384, 0 otherwise
904 */
905
906 /* 0 iff (2^129..2^383) are all one */
907 cond[0] = ((tmp[6] | 0xff000000000000) & tmp[5] & tmp[4] & tmp[3] & (tmp[2] | 0x0000000001ffff)) + 1;
908 /* 0 iff 2^128 bit is one */
909 cond[1] = (tmp[2] | ~0x00000000010000) + 1;
910 /* 0 iff (2^96..2^127) bits are all one */
911 cond[2] = ((tmp[2] | 0xffffffffff0000) & (tmp[1] | 0x0000ffffffffff)) + 1;
912 /* 0 iff (2^32..2^95) bits are all zero */
913 cond[3] = (tmp[1] & ~0xffff0000000000) | (tmp[0] & ~((int64_t) 0x000000ffffffff));
914 /* 0 iff (2^0..2^31) bits are all one */
915 cond[4] = (tmp[0] | 0xffffff00000000) + 1;
916
917 /*
918 * In effect, invert our conditions, so that 0 values become all 1's,
919 * any non-zero value in the low-order 56 bits becomes all 0's
920 */
921 for (i = 0; i < 5; i++)
922 cond[i] = ((cond[i] & 0x00ffffffffffffff) - 1) >> 63;
923
924 /*
925 * The condition for determining whether in is greater than our
926 * prime is given by the following condition.
927 */
928
929 /* First subtract 2^384 - 2^129 cheaply */
930 a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4])));
931 tmp[6] &= ~a;
932 tmp[5] &= ~a;
933 tmp[4] &= ~a;
934 tmp[3] &= ~a;
935 tmp[2] &= ~a | 0x0000000001ffff;
936
937 /*
938 * Subtract 2^128 - 2^96 by
939 * means of disjoint cases.
940 */
941
942 /* subtract 2^128 if that bit is present, and add 2^96 */
943 a = cond[0] & cond[1];
944 tmp[2] &= ~a | 0xfffffffffeffff;
945 tmp[1] += a & ((int64_t) 1 << 40);
946
947 /* otherwise, clear bits 2^127 .. 2^96 */
948 a = cond[0] & ~cond[1] & (cond[2] & (~cond[3] | cond[4]));
949 tmp[2] &= ~a | 0xffffffffff0000;
950 tmp[1] &= ~a | 0x0000ffffffffff;
951
952 /* finally, subtract the last 2^32 - 1 */
953 a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4])));
954 tmp[0] += a & (-((int64_t) 1 << 32) + 1);
955
956 /*
957 * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
958 * non-zero, so we only need one step
959 */
960 a = tmp[0] >> 63;
961 tmp[0] += a & two56;
962 tmp[1] -= a & 1;
963
964 /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
965 tmp[2] += tmp[1] >> 56;
966 tmp[1] &= 0x00ffffffffffffff;
967
968 tmp[3] += tmp[2] >> 56;
969 tmp[2] &= 0x00ffffffffffffff;
970
971 tmp[4] += tmp[3] >> 56;
972 tmp[3] &= 0x00ffffffffffffff;
973
974 tmp[5] += tmp[4] >> 56;
975 tmp[4] &= 0x00ffffffffffffff;
976
977 tmp[6] += tmp[5] >> 56;
978 tmp[5] &= 0x00ffffffffffffff;
979
980 memcpy(out, tmp, sizeof(felem));
981 }
982
983 /*-
984 * Group operations
985 * ----------------
986 *
987 * Building on top of the field operations we have the operations on the
988 * elliptic curve group itself. Points on the curve are represented in Jacobian
989 * coordinates
990 */
991
992 /*-
993 * point_double calculates 2*(x_in, y_in, z_in)
994 *
995 * The method is taken from:
996 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
997 *
998 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
999 * while x_out == y_in is not (maybe this works, but it's not tested).
1000 */
1001 static void
1002 point_double(felem x_out, felem y_out, felem z_out,
1003 const felem x_in, const felem y_in, const felem z_in)
1004 {
1005 widefelem tmp, tmp2;
1006 felem delta, gamma, beta, alpha, ftmp, ftmp2;
1007
1008 felem_assign(ftmp, x_in);
1009 felem_assign(ftmp2, x_in);
1010
1011 /* delta = z^2 */
1012 felem_square_reduce(delta, z_in); /* delta[i] < 2^56 */
1013
1014 /* gamma = y^2 */
1015 felem_square_reduce(gamma, y_in); /* gamma[i] < 2^56 */
1016
1017 /* beta = x*gamma */
1018 felem_mul_reduce(beta, x_in, gamma); /* beta[i] < 2^56 */
1019
1020 /* alpha = 3*(x-delta)*(x+delta) */
1021 felem_diff64(ftmp, delta); /* ftmp[i] < 2^60 + 2^58 + 2^44 */
1022 felem_sum64(ftmp2, delta); /* ftmp2[i] < 2^59 */
1023 felem_scalar64(ftmp2, 3); /* ftmp2[i] < 2^61 */
1024 felem_mul_reduce(alpha, ftmp, ftmp2); /* alpha[i] < 2^56 */
1025
1026 /* x' = alpha^2 - 8*beta */
1027 felem_square(tmp, alpha); /* tmp[i] < 2^115 */
1028 felem_assign(ftmp, beta); /* ftmp[i] < 2^56 */
1029 felem_scalar64(ftmp, 8); /* ftmp[i] < 2^59 */
1030 felem_diff_128_64(tmp, ftmp); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1031 felem_reduce(x_out, tmp); /* x_out[i] < 2^56 */
1032
1033 /* z' = (y + z)^2 - gamma - delta */
1034 felem_sum64(delta, gamma); /* delta[i] < 2^57 */
1035 felem_assign(ftmp, y_in); /* ftmp[i] < 2^56 */
1036 felem_sum64(ftmp, z_in); /* ftmp[i] < 2^56 */
1037 felem_square(tmp, ftmp); /* tmp[i] < 2^115 */
1038 felem_diff_128_64(tmp, delta); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1039 felem_reduce(z_out, tmp); /* z_out[i] < 2^56 */
1040
1041 /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1042 felem_scalar64(beta, 4); /* beta[i] < 2^58 */
1043 felem_diff64(beta, x_out); /* beta[i] < 2^60 + 2^58 + 2^44 */
1044 felem_mul(tmp, alpha, beta); /* tmp[i] < 2^119 */
1045 felem_square(tmp2, gamma); /* tmp2[i] < 2^115 */
1046 felem_scalar128(tmp2, 8); /* tmp2[i] < 2^118 */
1047 felem_diff128(tmp, tmp2); /* tmp[i] < 2^127 + 2^119 + 2^111 */
1048 felem_reduce(y_out, tmp); /* tmp[i] < 2^56 */
1049 }
1050
1051 /* copy_conditional copies in to out iff mask is all ones. */
1052 static void copy_conditional(felem out, const felem in, limb mask)
1053 {
1054 unsigned int i;
1055
1056 for (i = 0; i < NLIMBS; i++)
1057 out[i] ^= mask & (in[i] ^ out[i]);
1058 }
1059
1060 /*-
1061 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1062 *
1063 * The method is taken from
1064 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1065 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1066 *
1067 * This function includes a branch for checking whether the two input points
1068 * are equal (while not equal to the point at infinity). See comment below
1069 * on constant-time.
1070 */
1071 static void point_add(felem x3, felem y3, felem z3,
1072 const felem x1, const felem y1, const felem z1,
1073 const int mixed, const felem x2, const felem y2,
1074 const felem z2)
1075 {
1076 felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1077 widefelem tmp, tmp2;
1078 limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1079 limb points_equal;
1080
1081 z1_is_zero = felem_is_zero(z1);
1082 z2_is_zero = felem_is_zero(z2);
1083
1084 /* ftmp = z1z1 = z1**2 */
1085 felem_square_reduce(ftmp, z1); /* ftmp[i] < 2^56 */
1086
1087 if (!mixed) {
1088 /* ftmp2 = z2z2 = z2**2 */
1089 felem_square_reduce(ftmp2, z2); /* ftmp2[i] < 2^56 */
1090
1091 /* u1 = ftmp3 = x1*z2z2 */
1092 felem_mul_reduce(ftmp3, x1, ftmp2); /* ftmp3[i] < 2^56 */
1093
1094 /* ftmp5 = z1 + z2 */
1095 felem_assign(ftmp5, z1); /* ftmp5[i] < 2^56 */
1096 felem_sum64(ftmp5, z2); /* ftmp5[i] < 2^57 */
1097
1098 /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1099 felem_square(tmp, ftmp5); /* tmp[i] < 2^117 */
1100 felem_diff_128_64(tmp, ftmp); /* tmp[i] < 2^117 + 2^64 + 2^48 */
1101 felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1102 felem_reduce(ftmp5, tmp); /* ftmp5[i] < 2^56 */
1103
1104 /* ftmp2 = z2 * z2z2 */
1105 felem_mul_reduce(ftmp2, ftmp2, z2); /* ftmp2[i] < 2^56 */
1106
1107 /* s1 = ftmp6 = y1 * z2**3 */
1108 felem_mul_reduce(ftmp6, y1, ftmp2); /* ftmp6[i] < 2^56 */
1109 } else {
1110 /*
1111 * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1112 */
1113
1114 /* u1 = ftmp3 = x1*z2z2 */
1115 felem_assign(ftmp3, x1); /* ftmp3[i] < 2^56 */
1116
1117 /* ftmp5 = 2*z1z2 */
1118 felem_scalar(ftmp5, z1, 2); /* ftmp5[i] < 2^57 */
1119
1120 /* s1 = ftmp6 = y1 * z2**3 */
1121 felem_assign(ftmp6, y1); /* ftmp6[i] < 2^56 */
1122 }
1123 /* ftmp3[i] < 2^56, ftmp5[i] < 2^57, ftmp6[i] < 2^56 */
1124
1125 /* u2 = x2*z1z1 */
1126 felem_mul(tmp, x2, ftmp); /* tmp[i] < 2^115 */
1127
1128 /* h = ftmp4 = u2 - u1 */
1129 felem_diff_128_64(tmp, ftmp3); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1130 felem_reduce(ftmp4, tmp); /* ftmp[4] < 2^56 */
1131
1132 x_equal = felem_is_zero(ftmp4);
1133
1134 /* z_out = ftmp5 * h */
1135 felem_mul_reduce(z_out, ftmp5, ftmp4); /* z_out[i] < 2^56 */
1136
1137 /* ftmp = z1 * z1z1 */
1138 felem_mul_reduce(ftmp, ftmp, z1); /* ftmp[i] < 2^56 */
1139
1140 /* s2 = tmp = y2 * z1**3 */
1141 felem_mul(tmp, y2, ftmp); /* tmp[i] < 2^115 */
1142
1143 /* r = ftmp5 = (s2 - s1)*2 */
1144 felem_diff_128_64(tmp, ftmp6); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1145 felem_reduce(ftmp5, tmp); /* ftmp5[i] < 2^56 */
1146 y_equal = felem_is_zero(ftmp5);
1147 felem_scalar64(ftmp5, 2); /* ftmp5[i] < 2^57 */
1148
1149 /*
1150 * The formulae are incorrect if the points are equal, in affine coordinates
1151 * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1152 * happens.
1153 *
1154 * We use bitwise operations to avoid potential side-channels introduced by
1155 * the short-circuiting behaviour of boolean operators.
1156 *
1157 * The special case of either point being the point at infinity (z1 and/or
1158 * z2 are zero), is handled separately later on in this function, so we
1159 * avoid jumping to point_double here in those special cases.
1160 *
1161 * Notice the comment below on the implications of this branching for timing
1162 * leaks and why it is considered practically irrelevant.
1163 */
1164 points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1165
1166 if (points_equal) {
1167 /*
1168 * This is obviously not constant-time but it will almost-never happen
1169 * for ECDH / ECDSA.
1170 */
1171 point_double(x3, y3, z3, x1, y1, z1);
1172 return;
1173 }
1174
1175 /* I = ftmp = (2h)**2 */
1176 felem_assign(ftmp, ftmp4); /* ftmp[i] < 2^56 */
1177 felem_scalar64(ftmp, 2); /* ftmp[i] < 2^57 */
1178 felem_square_reduce(ftmp, ftmp); /* ftmp[i] < 2^56 */
1179
1180 /* J = ftmp2 = h * I */
1181 felem_mul_reduce(ftmp2, ftmp4, ftmp); /* ftmp2[i] < 2^56 */
1182
1183 /* V = ftmp4 = U1 * I */
1184 felem_mul_reduce(ftmp4, ftmp3, ftmp); /* ftmp4[i] < 2^56 */
1185
1186 /* x_out = r**2 - J - 2V */
1187 felem_square(tmp, ftmp5); /* tmp[i] < 2^117 */
1188 felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^64 + 2^48 */
1189 felem_assign(ftmp3, ftmp4); /* ftmp3[i] < 2^56 */
1190 felem_scalar64(ftmp4, 2); /* ftmp4[i] < 2^57 */
1191 felem_diff_128_64(tmp, ftmp4); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1192 felem_reduce(x_out, tmp); /* x_out[i] < 2^56 */
1193
1194 /* y_out = r(V-x_out) - 2 * s1 * J */
1195 felem_diff64(ftmp3, x_out); /* ftmp3[i] < 2^60 + 2^56 + 2^44 */
1196 felem_mul(tmp, ftmp5, ftmp3); /* tmp[i] < 2^116 */
1197 felem_mul(tmp2, ftmp6, ftmp2); /* tmp2[i] < 2^115 */
1198 felem_scalar128(tmp2, 2); /* tmp2[i] < 2^116 */
1199 felem_diff128(tmp, tmp2); /* tmp[i] < 2^127 + 2^116 + 2^111 */
1200 felem_reduce(y_out, tmp); /* y_out[i] < 2^56 */
1201
1202 copy_conditional(x_out, x2, z1_is_zero);
1203 copy_conditional(x_out, x1, z2_is_zero);
1204 copy_conditional(y_out, y2, z1_is_zero);
1205 copy_conditional(y_out, y1, z2_is_zero);
1206 copy_conditional(z_out, z2, z1_is_zero);
1207 copy_conditional(z_out, z1, z2_is_zero);
1208 felem_assign(x3, x_out);
1209 felem_assign(y3, y_out);
1210 felem_assign(z3, z_out);
1211 }
1212
1213 /*-
1214 * Base point pre computation
1215 * --------------------------
1216 *
1217 * Two different sorts of precomputed tables are used in the following code.
1218 * Each contain various points on the curve, where each point is three field
1219 * elements (x, y, z).
1220 *
1221 * For the base point table, z is usually 1 (0 for the point at infinity).
1222 * This table has 16 elements:
1223 * index | bits | point
1224 * ------+---------+------------------------------
1225 * 0 | 0 0 0 0 | 0G
1226 * 1 | 0 0 0 1 | 1G
1227 * 2 | 0 0 1 0 | 2^95G
1228 * 3 | 0 0 1 1 | (2^95 + 1)G
1229 * 4 | 0 1 0 0 | 2^190G
1230 * 5 | 0 1 0 1 | (2^190 + 1)G
1231 * 6 | 0 1 1 0 | (2^190 + 2^95)G
1232 * 7 | 0 1 1 1 | (2^190 + 2^95 + 1)G
1233 * 8 | 1 0 0 0 | 2^285G
1234 * 9 | 1 0 0 1 | (2^285 + 1)G
1235 * 10 | 1 0 1 0 | (2^285 + 2^95)G
1236 * 11 | 1 0 1 1 | (2^285 + 2^95 + 1)G
1237 * 12 | 1 1 0 0 | (2^285 + 2^190)G
1238 * 13 | 1 1 0 1 | (2^285 + 2^190 + 1)G
1239 * 14 | 1 1 1 0 | (2^285 + 2^190 + 2^95)G
1240 * 15 | 1 1 1 1 | (2^285 + 2^190 + 2^95 + 1)G
1241 *
1242 * The reason for this is so that we can clock bits into four different
1243 * locations when doing simple scalar multiplies against the base point.
1244 *
1245 * Tables for other points have table[i] = iG for i in 0 .. 16.
1246 */
1247
1248 /* gmul is the table of precomputed base points */
1249 static const felem gmul[16][3] = {
1250 {{0, 0, 0, 0, 0, 0, 0},
1251 {0, 0, 0, 0, 0, 0, 0},
1252 {0, 0, 0, 0, 0, 0, 0}},
1253 {{0x00545e3872760ab7, 0x00f25dbf55296c3a, 0x00e082542a385502, 0x008ba79b9859f741,
1254 0x0020ad746e1d3b62, 0x0005378eb1c71ef3, 0x0000aa87ca22be8b},
1255 {0x00431d7c90ea0e5f, 0x00b1ce1d7e819d7a, 0x0013b5f0b8c00a60, 0x00289a147ce9da31,
1256 0x0092dc29f8f41dbd, 0x002c6f5d9e98bf92, 0x00003617de4a9626},
1257 {1, 0, 0, 0, 0, 0, 0}},
1258 {{0x00024711cc902a90, 0x00acb2e579ab4fe1, 0x00af818a4b4d57b1, 0x00a17c7bec49c3de,
1259 0x004280482d726a8b, 0x00128dd0f0a90f3b, 0x00004387c1c3fa3c},
1260 {0x002ce76543cf5c3a, 0x00de6cee5ef58f0a, 0x00403e42fa561ca6, 0x00bc54d6f9cb9731,
1261 0x007155f925fb4ff1, 0x004a9ce731b7b9bc, 0x00002609076bd7b2},
1262 {1, 0, 0, 0, 0, 0, 0}},
1263 {{0x00e74c9182f0251d, 0x0039bf54bb111974, 0x00b9d2f2eec511d2, 0x0036b1594eb3a6a4,
1264 0x00ac3bb82d9d564b, 0x00f9313f4615a100, 0x00006716a9a91b10},
1265 {0x0046698116e2f15c, 0x00f34347067d3d33, 0x008de4ccfdebd002, 0x00e838c6b8e8c97b,
1266 0x006faf0798def346, 0x007349794a57563c, 0x00002629e7e6ad84},
1267 {1, 0, 0, 0, 0, 0, 0}},
1268 {{0x0075300e34fd163b, 0x0092e9db4e8d0ad3, 0x00254be9f625f760, 0x00512c518c72ae68,
1269 0x009bfcf162bede5a, 0x00bf9341566ce311, 0x0000cd6175bd41cf},
1270 {0x007dfe52af4ac70f, 0x0002159d2d5c4880, 0x00b504d16f0af8d0, 0x0014585e11f5e64c,
1271 0x0089c6388e030967, 0x00ffb270cbfa5f71, 0x00009a15d92c3947},
1272 {1, 0, 0, 0, 0, 0, 0}},
1273 {{0x0033fc1278dc4fe5, 0x00d53088c2caa043, 0x0085558827e2db66, 0x00c192bef387b736,
1274 0x00df6405a2225f2c, 0x0075205aa90fd91a, 0x0000137e3f12349d},
1275 {0x00ce5b115efcb07e, 0x00abc3308410deeb, 0x005dc6fc1de39904, 0x00907c1c496f36b4,
1276 0x0008e6ad3926cbe1, 0x00110747b787928c, 0x0000021b9162eb7e},
1277 {1, 0, 0, 0, 0, 0, 0}},
1278 {{0x008180042cfa26e1, 0x007b826a96254967, 0x0082473694d6b194, 0x007bd6880a45b589,
1279 0x00c0a5097072d1a3, 0x0019186555e18b4e, 0x000020278190e5ca},
1280 {0x00b4bef17de61ac0, 0x009535e3c38ed348, 0x002d4aa8e468ceab, 0x00ef40b431036ad3,
1281 0x00defd52f4542857, 0x0086edbf98234266, 0x00002025b3a7814d},
1282 {1, 0, 0, 0, 0, 0, 0}},
1283 {{0x00b238aa97b886be, 0x00ef3192d6dd3a32, 0x0079f9e01fd62df8, 0x00742e890daba6c5,
1284 0x008e5289144408ce, 0x0073bbcc8e0171a5, 0x0000c4fd329d3b52},
1285 {0x00c6f64a15ee23e7, 0x00dcfb7b171cad8b, 0x00039f6cbd805867, 0x00de024e428d4562,
1286 0x00be6a594d7c64c5, 0x0078467b70dbcd64, 0x0000251f2ed7079b},
1287 {1, 0, 0, 0, 0, 0, 0}},
1288 {{0x000e5cc25fc4b872, 0x005ebf10d31ef4e1, 0x0061e0ebd11e8256, 0x0076e026096f5a27,
1289 0x0013e6fc44662e9a, 0x0042b00289d3597e, 0x000024f089170d88},
1290 {0x001604d7e0effbe6, 0x0048d77cba64ec2c, 0x008166b16da19e36, 0x006b0d1a0f28c088,
1291 0x000259fcd47754fd, 0x00cc643e4d725f9a, 0x00007b10f3c79c14},
1292 {1, 0, 0, 0, 0, 0, 0}},
1293 {{0x00430155e3b908af, 0x00b801e4fec25226, 0x00b0d4bcfe806d26, 0x009fc4014eb13d37,
1294 0x0066c94e44ec07e8, 0x00d16adc03874ba2, 0x000030c917a0d2a7},
1295 {0x00edac9e21eb891c, 0x00ef0fb768102eff, 0x00c088cef272a5f3, 0x00cbf782134e2964,
1296 0x0001044a7ba9a0e3, 0x00e363f5b194cf3c, 0x00009ce85249e372},
1297 {1, 0, 0, 0, 0, 0, 0}},
1298 {{0x001dd492dda5a7eb, 0x008fd577be539fd1, 0x002ff4b25a5fc3f1, 0x0074a8a1b64df72f,
1299 0x002ba3d8c204a76c, 0x009d5cff95c8235a, 0x0000e014b9406e0f},
1300 {0x008c2e4dbfc98aba, 0x00f30bb89f1a1436, 0x00b46f7aea3e259c, 0x009224454ac02f54,
1301 0x00906401f5645fa2, 0x003a1d1940eabc77, 0x00007c9351d680e6},
1302 {1, 0, 0, 0, 0, 0, 0}},
1303 {{0x005a35d872ef967c, 0x0049f1b7884e1987, 0x0059d46d7e31f552, 0x00ceb4869d2d0fb6,
1304 0x00e8e89eee56802a, 0x0049d806a774aaf2, 0x0000147e2af0ae24},
1305 {0x005fd1bd852c6e5e, 0x00b674b7b3de6885, 0x003b9ea5eb9b6c08, 0x005c9f03babf3ef7,
1306 0x00605337fecab3c7, 0x009a3f85b11bbcc8, 0x0000455470f330ec},
1307 {1, 0, 0, 0, 0, 0, 0}},
1308 {{0x002197ff4d55498d, 0x00383e8916c2d8af, 0x00eb203f34d1c6d2, 0x0080367cbd11b542,
1309 0x00769b3be864e4f5, 0x0081a8458521c7bb, 0x0000c531b34d3539},
1310 {0x00e2a3d775fa2e13, 0x00534fc379573844, 0x00ff237d2a8db54a, 0x00d301b2335a8882,
1311 0x000f75ea96103a80, 0x0018fecb3cdd96fa, 0x0000304bf61e94eb},
1312 {1, 0, 0, 0, 0, 0, 0}},
1313 {{0x00b2afc332a73dbd, 0x0029a0d5bb007bc5, 0x002d628eb210f577, 0x009f59a36dd05f50,
1314 0x006d339de4eca613, 0x00c75a71addc86bc, 0x000060384c5ea93c},
1315 {0x00aa9641c32a30b4, 0x00cc73ae8cce565d, 0x00ec911a4df07f61, 0x00aa4b762ea4b264,
1316 0x0096d395bb393629, 0x004efacfb7632fe0, 0x00006f252f46fa3f},
1317 {1, 0, 0, 0, 0, 0, 0}},
1318 {{0x00567eec597c7af6, 0x0059ba6795204413, 0x00816d4e6f01196f, 0x004ae6b3eb57951d,
1319 0x00420f5abdda2108, 0x003401d1f57ca9d9, 0x0000cf5837b0b67a},
1320 {0x00eaa64b8aeeabf9, 0x00246ddf16bcb4de, 0x000e7e3c3aecd751, 0x0008449f04fed72e,
1321 0x00307b67ccf09183, 0x0017108c3556b7b1, 0x0000229b2483b3bf},
1322 {1, 0, 0, 0, 0, 0, 0}},
1323 {{0x00e7c491a7bb78a1, 0x00eafddd1d3049ab, 0x00352c05e2bc7c98, 0x003d6880c165fa5c,
1324 0x00b6ac61cc11c97d, 0x00beeb54fcf90ce5, 0x0000dc1f0b455edc},
1325 {0x002db2e7aee34d60, 0x0073b5f415a2d8c0, 0x00dd84e4193e9a0c, 0x00d02d873467c572,
1326 0x0018baaeda60aee5, 0x0013fb11d697c61e, 0x000083aafcc3a973},
1327 {1, 0, 0, 0, 0, 0, 0}}
1328 };
1329
1330 /*
1331 * select_point selects the |idx|th point from a precomputation table and
1332 * copies it to out.
1333 *
1334 * pre_comp below is of the size provided in |size|.
1335 */
1336 static void select_point(const limb idx, unsigned int size,
1337 const felem pre_comp[][3], felem out[3])
1338 {
1339 unsigned int i, j;
1340 limb *outlimbs = &out[0][0];
1341
1342 memset(out, 0, sizeof(*out) * 3);
1343
1344 for (i = 0; i < size; i++) {
1345 const limb *inlimbs = &pre_comp[i][0][0];
1346 limb mask = i ^ idx;
1347
1348 mask |= mask >> 4;
1349 mask |= mask >> 2;
1350 mask |= mask >> 1;
1351 mask &= 1;
1352 mask--;
1353 for (j = 0; j < NLIMBS * 3; j++)
1354 outlimbs[j] |= inlimbs[j] & mask;
1355 }
1356 }
1357
1358 /* get_bit returns the |i|th bit in |in| */
1359 static char get_bit(const felem_bytearray in, int i)
1360 {
1361 if (i < 0 || i >= 384)
1362 return 0;
1363 return (in[i >> 3] >> (i & 7)) & 1;
1364 }
1365
1366 /*
1367 * Interleaved point multiplication using precomputed point multiples: The
1368 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1369 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1370 * generator, using certain (large) precomputed multiples in g_pre_comp.
1371 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1372 */
1373 static void batch_mul(felem x_out, felem y_out, felem z_out,
1374 const felem_bytearray scalars[],
1375 const unsigned int num_points, const u8 *g_scalar,
1376 const int mixed, const felem pre_comp[][17][3],
1377 const felem g_pre_comp[16][3])
1378 {
1379 int i, skip;
1380 unsigned int num, gen_mul = (g_scalar != NULL);
1381 felem nq[3], tmp[4];
1382 limb bits;
1383 u8 sign, digit;
1384
1385 /* set nq to the point at infinity */
1386 memset(nq, 0, sizeof(nq));
1387
1388 /*
1389 * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1390 * of the generator (last quarter of rounds) and additions of other
1391 * points multiples (every 5th round).
1392 */
1393 skip = 1; /* save two point operations in the first
1394 * round */
1395 for (i = (num_points ? 380 : 98); i >= 0; --i) {
1396 /* double */
1397 if (!skip)
1398 point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1399
1400 /* add multiples of the generator */
1401 if (gen_mul && (i <= 98)) {
1402 bits = get_bit(g_scalar, i + 285) << 3;
1403 if (i < 95) {
1404 bits |= get_bit(g_scalar, i + 190) << 2;
1405 bits |= get_bit(g_scalar, i + 95) << 1;
1406 bits |= get_bit(g_scalar, i);
1407 }
1408 /* select the point to add, in constant time */
1409 select_point(bits, 16, g_pre_comp, tmp);
1410 if (!skip) {
1411 /* The 1 argument below is for "mixed" */
1412 point_add(nq[0], nq[1], nq[2],
1413 nq[0], nq[1], nq[2], 1,
1414 tmp[0], tmp[1], tmp[2]);
1415 } else {
1416 memcpy(nq, tmp, 3 * sizeof(felem));
1417 skip = 0;
1418 }
1419 }
1420
1421 /* do other additions every 5 doublings */
1422 if (num_points && (i % 5 == 0)) {
1423 /* loop over all scalars */
1424 for (num = 0; num < num_points; ++num) {
1425 bits = get_bit(scalars[num], i + 4) << 5;
1426 bits |= get_bit(scalars[num], i + 3) << 4;
1427 bits |= get_bit(scalars[num], i + 2) << 3;
1428 bits |= get_bit(scalars[num], i + 1) << 2;
1429 bits |= get_bit(scalars[num], i) << 1;
1430 bits |= get_bit(scalars[num], i - 1);
1431 ossl_ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1432
1433 /*
1434 * select the point to add or subtract, in constant time
1435 */
1436 select_point(digit, 17, pre_comp[num], tmp);
1437 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1438 * point */
1439 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1440
1441 if (!skip) {
1442 point_add(nq[0], nq[1], nq[2],
1443 nq[0], nq[1], nq[2], mixed,
1444 tmp[0], tmp[1], tmp[2]);
1445 } else {
1446 memcpy(nq, tmp, 3 * sizeof(felem));
1447 skip = 0;
1448 }
1449 }
1450 }
1451 }
1452 felem_assign(x_out, nq[0]);
1453 felem_assign(y_out, nq[1]);
1454 felem_assign(z_out, nq[2]);
1455 }
1456
1457 /* Precomputation for the group generator. */
1458 struct nistp384_pre_comp_st {
1459 felem g_pre_comp[16][3];
1460 CRYPTO_REF_COUNT references;
1461 };
1462
1463 const EC_METHOD *ossl_ec_GFp_nistp384_method(void)
1464 {
1465 static const EC_METHOD ret = {
1466 EC_FLAGS_DEFAULT_OCT,
1467 NID_X9_62_prime_field,
1468 ossl_ec_GFp_nistp384_group_init,
1469 ossl_ec_GFp_simple_group_finish,
1470 ossl_ec_GFp_simple_group_clear_finish,
1471 ossl_ec_GFp_nist_group_copy,
1472 ossl_ec_GFp_nistp384_group_set_curve,
1473 ossl_ec_GFp_simple_group_get_curve,
1474 ossl_ec_GFp_simple_group_get_degree,
1475 ossl_ec_group_simple_order_bits,
1476 ossl_ec_GFp_simple_group_check_discriminant,
1477 ossl_ec_GFp_simple_point_init,
1478 ossl_ec_GFp_simple_point_finish,
1479 ossl_ec_GFp_simple_point_clear_finish,
1480 ossl_ec_GFp_simple_point_copy,
1481 ossl_ec_GFp_simple_point_set_to_infinity,
1482 ossl_ec_GFp_simple_point_set_affine_coordinates,
1483 ossl_ec_GFp_nistp384_point_get_affine_coordinates,
1484 0, /* point_set_compressed_coordinates */
1485 0, /* point2oct */
1486 0, /* oct2point */
1487 ossl_ec_GFp_simple_add,
1488 ossl_ec_GFp_simple_dbl,
1489 ossl_ec_GFp_simple_invert,
1490 ossl_ec_GFp_simple_is_at_infinity,
1491 ossl_ec_GFp_simple_is_on_curve,
1492 ossl_ec_GFp_simple_cmp,
1493 ossl_ec_GFp_simple_make_affine,
1494 ossl_ec_GFp_simple_points_make_affine,
1495 ossl_ec_GFp_nistp384_points_mul,
1496 ossl_ec_GFp_nistp384_precompute_mult,
1497 ossl_ec_GFp_nistp384_have_precompute_mult,
1498 ossl_ec_GFp_nist_field_mul,
1499 ossl_ec_GFp_nist_field_sqr,
1500 0, /* field_div */
1501 ossl_ec_GFp_simple_field_inv,
1502 0, /* field_encode */
1503 0, /* field_decode */
1504 0, /* field_set_to_one */
1505 ossl_ec_key_simple_priv2oct,
1506 ossl_ec_key_simple_oct2priv,
1507 0, /* set private */
1508 ossl_ec_key_simple_generate_key,
1509 ossl_ec_key_simple_check_key,
1510 ossl_ec_key_simple_generate_public_key,
1511 0, /* keycopy */
1512 0, /* keyfinish */
1513 ossl_ecdh_simple_compute_key,
1514 ossl_ecdsa_simple_sign_setup,
1515 ossl_ecdsa_simple_sign_sig,
1516 ossl_ecdsa_simple_verify_sig,
1517 0, /* field_inverse_mod_ord */
1518 0, /* blind_coordinates */
1519 0, /* ladder_pre */
1520 0, /* ladder_step */
1521 0 /* ladder_post */
1522 };
1523
1524 return &ret;
1525 }
1526
1527 /******************************************************************************/
1528 /*
1529 * FUNCTIONS TO MANAGE PRECOMPUTATION
1530 */
1531
1532 static NISTP384_PRE_COMP *nistp384_pre_comp_new(void)
1533 {
1534 NISTP384_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1535
1536 if (ret == NULL)
1537 return ret;
1538
1539 if (!CRYPTO_NEW_REF(&ret->references, 1)) {
1540 OPENSSL_free(ret);
1541 return NULL;
1542 }
1543 return ret;
1544 }
1545
1546 NISTP384_PRE_COMP *ossl_ec_nistp384_pre_comp_dup(NISTP384_PRE_COMP *p)
1547 {
1548 int i;
1549
1550 if (p != NULL)
1551 CRYPTO_UP_REF(&p->references, &i);
1552 return p;
1553 }
1554
1555 void ossl_ec_nistp384_pre_comp_free(NISTP384_PRE_COMP *p)
1556 {
1557 int i;
1558
1559 if (p == NULL)
1560 return;
1561
1562 CRYPTO_DOWN_REF(&p->references, &i);
1563 REF_PRINT_COUNT("ossl_ec_nistp384", p);
1564 if (i > 0)
1565 return;
1566 REF_ASSERT_ISNT(i < 0);
1567
1568 CRYPTO_FREE_REF(&p->references);
1569 OPENSSL_free(p);
1570 }
1571
1572 /******************************************************************************/
1573 /*
1574 * OPENSSL EC_METHOD FUNCTIONS
1575 */
1576
1577 int ossl_ec_GFp_nistp384_group_init(EC_GROUP *group)
1578 {
1579 int ret;
1580
1581 ret = ossl_ec_GFp_simple_group_init(group);
1582 group->a_is_minus3 = 1;
1583 return ret;
1584 }
1585
1586 int ossl_ec_GFp_nistp384_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1587 const BIGNUM *a, const BIGNUM *b,
1588 BN_CTX *ctx)
1589 {
1590 int ret = 0;
1591 BIGNUM *curve_p, *curve_a, *curve_b;
1592 #ifndef FIPS_MODULE
1593 BN_CTX *new_ctx = NULL;
1594
1595 if (ctx == NULL)
1596 ctx = new_ctx = BN_CTX_new();
1597 #endif
1598 if (ctx == NULL)
1599 return 0;
1600
1601 BN_CTX_start(ctx);
1602 curve_p = BN_CTX_get(ctx);
1603 curve_a = BN_CTX_get(ctx);
1604 curve_b = BN_CTX_get(ctx);
1605 if (curve_b == NULL)
1606 goto err;
1607 BN_bin2bn(nistp384_curve_params[0], sizeof(felem_bytearray), curve_p);
1608 BN_bin2bn(nistp384_curve_params[1], sizeof(felem_bytearray), curve_a);
1609 BN_bin2bn(nistp384_curve_params[2], sizeof(felem_bytearray), curve_b);
1610 if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1611 ERR_raise(ERR_LIB_EC, EC_R_WRONG_CURVE_PARAMETERS);
1612 goto err;
1613 }
1614 group->field_mod_func = BN_nist_mod_384;
1615 ret = ossl_ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1616 err:
1617 BN_CTX_end(ctx);
1618 #ifndef FIPS_MODULE
1619 BN_CTX_free(new_ctx);
1620 #endif
1621 return ret;
1622 }
1623
1624 /*
1625 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1626 * (X/Z^2, Y/Z^3)
1627 */
1628 int ossl_ec_GFp_nistp384_point_get_affine_coordinates(const EC_GROUP *group,
1629 const EC_POINT *point,
1630 BIGNUM *x, BIGNUM *y,
1631 BN_CTX *ctx)
1632 {
1633 felem z1, z2, x_in, y_in, x_out, y_out;
1634 widefelem tmp;
1635
1636 if (EC_POINT_is_at_infinity(group, point)) {
1637 ERR_raise(ERR_LIB_EC, EC_R_POINT_AT_INFINITY);
1638 return 0;
1639 }
1640 if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1641 (!BN_to_felem(z1, point->Z)))
1642 return 0;
1643 felem_inv(z2, z1);
1644 felem_square(tmp, z2);
1645 felem_reduce(z1, tmp);
1646 felem_mul(tmp, x_in, z1);
1647 felem_reduce(x_in, tmp);
1648 felem_contract(x_out, x_in);
1649 if (x != NULL) {
1650 if (!felem_to_BN(x, x_out)) {
1651 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1652 return 0;
1653 }
1654 }
1655 felem_mul(tmp, z1, z2);
1656 felem_reduce(z1, tmp);
1657 felem_mul(tmp, y_in, z1);
1658 felem_reduce(y_in, tmp);
1659 felem_contract(y_out, y_in);
1660 if (y != NULL) {
1661 if (!felem_to_BN(y, y_out)) {
1662 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1663 return 0;
1664 }
1665 }
1666 return 1;
1667 }
1668
1669 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1670 static void make_points_affine(size_t num, felem points[][3],
1671 felem tmp_felems[])
1672 {
1673 /*
1674 * Runs in constant time, unless an input is the point at infinity (which
1675 * normally shouldn't happen).
1676 */
1677 ossl_ec_GFp_nistp_points_make_affine_internal(num,
1678 points,
1679 sizeof(felem),
1680 tmp_felems,
1681 (void (*)(void *))felem_one,
1682 felem_is_zero_int,
1683 (void (*)(void *, const void *))
1684 felem_assign,
1685 (void (*)(void *, const void *))
1686 felem_square_reduce,
1687 (void (*)(void *, const void *, const void*))
1688 felem_mul_reduce,
1689 (void (*)(void *, const void *))
1690 felem_inv,
1691 (void (*)(void *, const void *))
1692 felem_contract);
1693 }
1694
1695 /*
1696 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1697 * values Result is stored in r (r can equal one of the inputs).
1698 */
1699 int ossl_ec_GFp_nistp384_points_mul(const EC_GROUP *group, EC_POINT *r,
1700 const BIGNUM *scalar, size_t num,
1701 const EC_POINT *points[],
1702 const BIGNUM *scalars[], BN_CTX *ctx)
1703 {
1704 int ret = 0;
1705 int j;
1706 int mixed = 0;
1707 BIGNUM *x, *y, *z, *tmp_scalar;
1708 felem_bytearray g_secret;
1709 felem_bytearray *secrets = NULL;
1710 felem (*pre_comp)[17][3] = NULL;
1711 felem *tmp_felems = NULL;
1712 unsigned int i;
1713 int num_bytes;
1714 int have_pre_comp = 0;
1715 size_t num_points = num;
1716 felem x_in, y_in, z_in, x_out, y_out, z_out;
1717 NISTP384_PRE_COMP *pre = NULL;
1718 felem(*g_pre_comp)[3] = NULL;
1719 EC_POINT *generator = NULL;
1720 const EC_POINT *p = NULL;
1721 const BIGNUM *p_scalar = NULL;
1722
1723 BN_CTX_start(ctx);
1724 x = BN_CTX_get(ctx);
1725 y = BN_CTX_get(ctx);
1726 z = BN_CTX_get(ctx);
1727 tmp_scalar = BN_CTX_get(ctx);
1728 if (tmp_scalar == NULL)
1729 goto err;
1730
1731 if (scalar != NULL) {
1732 pre = group->pre_comp.nistp384;
1733 if (pre)
1734 /* we have precomputation, try to use it */
1735 g_pre_comp = &pre->g_pre_comp[0];
1736 else
1737 /* try to use the standard precomputation */
1738 g_pre_comp = (felem(*)[3]) gmul;
1739 generator = EC_POINT_new(group);
1740 if (generator == NULL)
1741 goto err;
1742 /* get the generator from precomputation */
1743 if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1744 !felem_to_BN(y, g_pre_comp[1][1]) ||
1745 !felem_to_BN(z, g_pre_comp[1][2])) {
1746 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1747 goto err;
1748 }
1749 if (!ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group,
1750 generator,
1751 x, y, z, ctx))
1752 goto err;
1753 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1754 /* precomputation matches generator */
1755 have_pre_comp = 1;
1756 else
1757 /*
1758 * we don't have valid precomputation: treat the generator as a
1759 * random point
1760 */
1761 num_points++;
1762 }
1763
1764 if (num_points > 0) {
1765 if (num_points >= 2) {
1766 /*
1767 * unless we precompute multiples for just one point, converting
1768 * those into affine form is time well spent
1769 */
1770 mixed = 1;
1771 }
1772 secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1773 pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1774 if (mixed)
1775 tmp_felems =
1776 OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1777 if ((secrets == NULL) || (pre_comp == NULL)
1778 || (mixed && (tmp_felems == NULL)))
1779 goto err;
1780
1781 /*
1782 * we treat NULL scalars as 0, and NULL points as points at infinity,
1783 * i.e., they contribute nothing to the linear combination
1784 */
1785 for (i = 0; i < num_points; ++i) {
1786 if (i == num) {
1787 /*
1788 * we didn't have a valid precomputation, so we pick the
1789 * generator
1790 */
1791 p = EC_GROUP_get0_generator(group);
1792 p_scalar = scalar;
1793 } else {
1794 /* the i^th point */
1795 p = points[i];
1796 p_scalar = scalars[i];
1797 }
1798 if (p_scalar != NULL && p != NULL) {
1799 /* reduce scalar to 0 <= scalar < 2^384 */
1800 if ((BN_num_bits(p_scalar) > 384)
1801 || (BN_is_negative(p_scalar))) {
1802 /*
1803 * this is an unusual input, and we don't guarantee
1804 * constant-timeness
1805 */
1806 if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1807 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1808 goto err;
1809 }
1810 num_bytes = BN_bn2lebinpad(tmp_scalar,
1811 secrets[i], sizeof(secrets[i]));
1812 } else {
1813 num_bytes = BN_bn2lebinpad(p_scalar,
1814 secrets[i], sizeof(secrets[i]));
1815 }
1816 if (num_bytes < 0) {
1817 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1818 goto err;
1819 }
1820 /* precompute multiples */
1821 if ((!BN_to_felem(x_out, p->X)) ||
1822 (!BN_to_felem(y_out, p->Y)) ||
1823 (!BN_to_felem(z_out, p->Z)))
1824 goto err;
1825 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1826 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1827 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1828 for (j = 2; j <= 16; ++j) {
1829 if (j & 1) {
1830 point_add(pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1831 pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2], 0,
1832 pre_comp[i][j - 1][0], pre_comp[i][j - 1][1], pre_comp[i][j - 1][2]);
1833 } else {
1834 point_double(pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1835 pre_comp[i][j / 2][0], pre_comp[i][j / 2][1], pre_comp[i][j / 2][2]);
1836 }
1837 }
1838 }
1839 }
1840 if (mixed)
1841 make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1842 }
1843
1844 /* the scalar for the generator */
1845 if (scalar != NULL && have_pre_comp) {
1846 memset(g_secret, 0, sizeof(g_secret));
1847 /* reduce scalar to 0 <= scalar < 2^384 */
1848 if ((BN_num_bits(scalar) > 384) || (BN_is_negative(scalar))) {
1849 /*
1850 * this is an unusual input, and we don't guarantee
1851 * constant-timeness
1852 */
1853 if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1854 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1855 goto err;
1856 }
1857 num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
1858 } else {
1859 num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
1860 }
1861 /* do the multiplication with generator precomputation */
1862 batch_mul(x_out, y_out, z_out,
1863 (const felem_bytearray(*))secrets, num_points,
1864 g_secret,
1865 mixed, (const felem(*)[17][3])pre_comp,
1866 (const felem(*)[3])g_pre_comp);
1867 } else {
1868 /* do the multiplication without generator precomputation */
1869 batch_mul(x_out, y_out, z_out,
1870 (const felem_bytearray(*))secrets, num_points,
1871 NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1872 }
1873 /* reduce the output to its unique minimal representation */
1874 felem_contract(x_in, x_out);
1875 felem_contract(y_in, y_out);
1876 felem_contract(z_in, z_out);
1877 if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1878 (!felem_to_BN(z, z_in))) {
1879 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1880 goto err;
1881 }
1882 ret = ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z,
1883 ctx);
1884
1885 err:
1886 BN_CTX_end(ctx);
1887 EC_POINT_free(generator);
1888 OPENSSL_free(secrets);
1889 OPENSSL_free(pre_comp);
1890 OPENSSL_free(tmp_felems);
1891 return ret;
1892 }
1893
1894 int ossl_ec_GFp_nistp384_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1895 {
1896 int ret = 0;
1897 NISTP384_PRE_COMP *pre = NULL;
1898 int i, j;
1899 BIGNUM *x, *y;
1900 EC_POINT *generator = NULL;
1901 felem tmp_felems[16];
1902 #ifndef FIPS_MODULE
1903 BN_CTX *new_ctx = NULL;
1904 #endif
1905
1906 /* throw away old precomputation */
1907 EC_pre_comp_free(group);
1908
1909 #ifndef FIPS_MODULE
1910 if (ctx == NULL)
1911 ctx = new_ctx = BN_CTX_new();
1912 #endif
1913 if (ctx == NULL)
1914 return 0;
1915
1916 BN_CTX_start(ctx);
1917 x = BN_CTX_get(ctx);
1918 y = BN_CTX_get(ctx);
1919 if (y == NULL)
1920 goto err;
1921 /* get the generator */
1922 if (group->generator == NULL)
1923 goto err;
1924 generator = EC_POINT_new(group);
1925 if (generator == NULL)
1926 goto err;
1927 BN_bin2bn(nistp384_curve_params[3], sizeof(felem_bytearray), x);
1928 BN_bin2bn(nistp384_curve_params[4], sizeof(felem_bytearray), y);
1929 if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
1930 goto err;
1931 if ((pre = nistp384_pre_comp_new()) == NULL)
1932 goto err;
1933 /*
1934 * if the generator is the standard one, use built-in precomputation
1935 */
1936 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1937 memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1938 goto done;
1939 }
1940 if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
1941 (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
1942 (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
1943 goto err;
1944 /* compute 2^95*G, 2^190*G, 2^285*G */
1945 for (i = 1; i <= 4; i <<= 1) {
1946 point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
1947 pre->g_pre_comp[i][0], pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
1948 for (j = 0; j < 94; ++j) {
1949 point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
1950 pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2]);
1951 }
1952 }
1953 /* g_pre_comp[0] is the point at infinity */
1954 memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
1955 /* the remaining multiples */
1956 /* 2^95*G + 2^190*G */
1957 point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1], pre->g_pre_comp[6][2],
1958 pre->g_pre_comp[4][0], pre->g_pre_comp[4][1], pre->g_pre_comp[4][2], 0,
1959 pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]);
1960 /* 2^95*G + 2^285*G */
1961 point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1], pre->g_pre_comp[10][2],
1962 pre->g_pre_comp[8][0], pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], 0,
1963 pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]);
1964 /* 2^190*G + 2^285*G */
1965 point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
1966 pre->g_pre_comp[8][0], pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], 0,
1967 pre->g_pre_comp[4][0], pre->g_pre_comp[4][1], pre->g_pre_comp[4][2]);
1968 /* 2^95*G + 2^190*G + 2^285*G */
1969 point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1], pre->g_pre_comp[14][2],
1970 pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2], 0,
1971 pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]);
1972 for (i = 1; i < 8; ++i) {
1973 /* odd multiples: add G */
1974 point_add(pre->g_pre_comp[2 * i + 1][0], pre->g_pre_comp[2 * i + 1][1], pre->g_pre_comp[2 * i + 1][2],
1975 pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
1976 pre->g_pre_comp[1][0], pre->g_pre_comp[1][1], pre->g_pre_comp[1][2]);
1977 }
1978 make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
1979
1980 done:
1981 SETPRECOMP(group, nistp384, pre);
1982 ret = 1;
1983 pre = NULL;
1984 err:
1985 BN_CTX_end(ctx);
1986 EC_POINT_free(generator);
1987 #ifndef FIPS_MODULE
1988 BN_CTX_free(new_ctx);
1989 #endif
1990 ossl_ec_nistp384_pre_comp_free(pre);
1991 return ret;
1992 }
1993
1994 int ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP *group)
1995 {
1996 return HAVEPRECOMP(group, nistp384);
1997 }