]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/ec/curve448/arch_64/f_impl.c
2b428cd686695af95a4b9feaeca52aa1161d54b6
[thirdparty/openssl.git] / crypto / ec / curve448 / arch_64 / f_impl.c
1 /*
2 * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
3 * Copyright 2014 Cryptography Research, Inc.
4 *
5 * Licensed under the OpenSSL license (the "License"). You may not use
6 * this file except in compliance with the License. You can obtain a copy
7 * in the file LICENSE in the source distribution or at
8 * https://www.openssl.org/source/license.html
9 *
10 * Originally written by Mike Hamburg
11 */
12
13 #include "../field.h"
14
15 void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs)
16 {
17 const uint64_t *a = as->limb, *b = bs->limb;
18 uint64_t *c = cs->limb;
19 uint128_t accum0 = 0, accum1 = 0, accum2;
20 uint64_t mask = (1ULL << 56) - 1;
21 uint64_t aa[4], bb[4], bbb[4];
22 unsigned int i, j;
23
24 for (i = 0; i < 4; i++) {
25 aa[i] = a[i] + a[i + 4];
26 bb[i] = b[i] + b[i + 4];
27 bbb[i] = bb[i] + b[i + 4];
28 }
29
30 for (i = 0; i < 4; i++) {
31 accum2 = 0;
32
33 for (j = 0; j <= i; j++) {
34 accum2 += widemul(a[j], b[i - j]);
35 accum1 += widemul(aa[j], bb[i - j]);
36 accum0 += widemul(a[j + 4], b[i - j + 4]);
37 }
38 for (; j < 4; j++) {
39 accum2 += widemul(a[j], b[i - j + 8]);
40 accum1 += widemul(aa[j], bbb[i - j + 4]);
41 accum0 += widemul(a[j + 4], bb[i - j + 4]);
42 }
43
44 accum1 -= accum2;
45 accum0 += accum2;
46
47 c[i] = ((uint64_t)(accum0)) & mask;
48 c[i + 4] = ((uint64_t)(accum1)) & mask;
49
50 accum0 >>= 56;
51 accum1 >>= 56;
52 }
53
54 accum0 += accum1;
55 accum0 += c[4];
56 accum1 += c[0];
57 c[4] = ((uint64_t)(accum0)) & mask;
58 c[0] = ((uint64_t)(accum1)) & mask;
59
60 accum0 >>= 56;
61 accum1 >>= 56;
62
63 c[5] += ((uint64_t)(accum0));
64 c[1] += ((uint64_t)(accum1));
65 }
66
67 void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b)
68 {
69 const uint64_t *a = as->limb;
70 uint64_t *c = cs->limb;
71 uint128_t accum0 = 0, accum4 = 0;
72 uint64_t mask = (1ULL << 56) - 1;
73 int i;
74
75 for (i = 0; i < 4; i++) {
76 accum0 += widemul(b, a[i]);
77 accum4 += widemul(b, a[i + 4]);
78 c[i] = accum0 & mask;
79 accum0 >>= 56;
80 c[i + 4] = accum4 & mask;
81 accum4 >>= 56;
82 }
83
84 accum0 += accum4 + c[4];
85 c[4] = accum0 & mask;
86 c[5] += accum0 >> 56;
87
88 accum4 += c[0];
89 c[0] = accum4 & mask;
90 c[1] += accum4 >> 56;
91 }
92
/*
 * Square a field element: cs = as * as.
 *
 * Fully unrolled.  The eight input limbs are read as a low half a[0..3] and
 * a high half a[4..7]; aa[] holds their limb-wise sums.  Output limbs are
 * produced in pairs, in the order (3,7), (0,4), (1,5), (2,6), and limbs 3
 * and 7 are then revisited to absorb the carry out of limbs 2 and 6.
 * Symmetric cross terms are computed once and doubled, either with an
 * explicit "2 *" on one operand or (for the first pair) with a shift.
 *
 * NOTE(review): accum0/accum1 are unsigned, so the "-=" steps rely on
 * modular arithmetic and on the running totals ending up non-negative --
 * presumably guaranteed by the limb bounds documented in field.h; confirm.
 * NOTE(review): "2 * x" is evaluated in 64 bits before widemul, which
 * presumably assumes input limbs leave enough headroom -- confirm.
 * NOTE(review): the carry folding is consistent with reduction modulo
 * 2^448 - 2^224 - 1, presumably the curve448 field prime; confirm.
 */
void gf_sqr(gf_s * RESTRICT cs, const gf as)
{
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;
    uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ULL << 56) - 1;
    uint64_t aa[4];
    unsigned int i;

    /* For some reason clang doesn't vectorize this without prompting? */
    for (i = 0; i < 4; i++)
        aa[i] = a[i] + a[i + 4];

    /*
     * Limbs 3 and 7: cross terms whose indices sum to 3.  Each pair is
     * accumulated once only (un-doubled) at this point.
     */
    accum2 = widemul(a[0], a[3]);
    accum0 = widemul(aa[0], aa[3]);
    accum1 = widemul(a[4], a[7]);

    accum2 += widemul(a[1], a[2]);
    accum0 += widemul(aa[1], aa[2]);
    accum1 += widemul(a[5], a[6]);

    accum0 -= accum2;
    accum1 += accum2;

    /* The "<< 1" applies the missing factor of two while storing. */
    c[3] = ((uint64_t)(accum1)) << 1 & mask;
    c[7] = ((uint64_t)(accum0)) << 1 & mask;

    /*
     * Shift by 55 rather than 56: the totals are still un-doubled, so this
     * yields the carry of the doubled value beyond 56 bits.
     */
    accum0 >>= 55;
    accum1 >>= 55;

    /*
     * Limbs 0 and 4.  Note the role swap: accum0 (which produced c[7]) now
     * builds c[0], and accum1 (which produced c[3]) now builds c[4].
     */
    accum0 += widemul(2 * aa[1], aa[3]);
    accum1 += widemul(2 * a[5], a[7]);
    accum0 += widemul(aa[2], aa[2]);
    /* Must read accum0 exactly here, before the terms below are applied. */
    accum1 += accum0;

    accum0 -= widemul(2 * a[1], a[3]);
    accum1 += widemul(a[6], a[6]);

    accum2 = widemul(a[0], a[0]);
    accum1 -= accum2;
    accum0 += accum2;

    accum0 -= widemul(a[2], a[2]);
    accum1 += widemul(aa[0], aa[0]);
    accum0 += widemul(a[4], a[4]);

    c[0] = ((uint64_t)(accum0)) & mask;
    c[4] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Limbs 1 and 5, starting from the carry out of limbs 0 and 4. */
    accum2 = widemul(2 * aa[2], aa[3]);
    accum0 -= widemul(2 * a[2], a[3]);
    accum1 += widemul(2 * a[6], a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2 = widemul(2 * a[0], a[1]);
    accum1 += widemul(2 * aa[0], aa[1]);
    accum0 += widemul(2 * a[4], a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[1] = ((uint64_t)(accum0)) & mask;
    c[5] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Limbs 2 and 6, starting from the carry out of limbs 1 and 5. */
    accum2 = widemul(aa[3], aa[3]);
    accum0 -= widemul(a[3], a[3]);
    accum1 += widemul(a[7], a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2 = widemul(2 * a[0], a[2]);
    accum1 += widemul(2 * aa[0], aa[2]);
    accum0 += widemul(2 * a[4], a[6]);

    accum2 += widemul(a[1], a[1]);
    accum1 += widemul(aa[1], aa[1]);
    accum0 += widemul(a[5], a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[2] = ((uint64_t)(accum0)) & mask;
    c[6] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    /* Add the carries onto the limb 3 / limb 7 values stored at the start. */
    accum0 += c[3];
    accum1 += c[7];
    c[3] = ((uint64_t)(accum0)) & mask;
    c[7] = ((uint64_t)(accum1)) & mask;

    /* we could almost stop here, but it wouldn't be stable, so... */

    /*
     * Final wrap-around: the carry out of limb 3 (accum0) goes to limb 4;
     * the carry out of limb 7 (accum1) goes to both limb 4 and limb 0.
     * Limbs 4 and 0 are not re-masked after this addition.
     */
    accum0 >>= 56;
    accum1 >>= 56;
    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
    c[0] += ((uint64_t)(accum1));
}
198 c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
199 c[0] += ((uint64_t)(accum1));
200 }