/*
 * Copyright 2017-2022 The OpenSSL Project Authors. All Rights Reserved.
 * Copyright 2014 Cryptography Research, Inc.
 *
 * Licensed under the Apache License 2.0 (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 *
 * Originally written by Mike Hamburg
 */
#include "internal/e_os.h"
#include <openssl/macros.h>
#include "internal/numbers.h"

#ifndef UINT128_MAX
/* No support for 128 bit ints, so do nothing here */
NON_EMPTY_TRANSLATION_UNIT
#else
/* NOTE(review): the matching #endif is expected at the end of the file */

# include "../field.h"
24 void gf_mul(gf_s
* RESTRICT cs
, const gf as
, const gf bs
)
26 const uint64_t *a
= as
->limb
, *b
= bs
->limb
;
27 uint64_t *c
= cs
->limb
;
28 uint128_t accum0
= 0, accum1
= 0, accum2
;
29 uint64_t mask
= (1ULL << 56) - 1;
30 uint64_t aa
[4], bb
[4], bbb
[4];
33 for (i
= 0; i
< 4; i
++) {
34 aa
[i
] = a
[i
] + a
[i
+ 4];
35 bb
[i
] = b
[i
] + b
[i
+ 4];
36 bbb
[i
] = bb
[i
] + b
[i
+ 4];
39 for (i
= 0; i
< 4; i
++) {
42 for (j
= 0; j
<= i
; j
++) {
43 accum2
+= widemul(a
[j
], b
[i
- j
]);
44 accum1
+= widemul(aa
[j
], bb
[i
- j
]);
45 accum0
+= widemul(a
[j
+ 4], b
[i
- j
+ 4]);
48 accum2
+= widemul(a
[j
], b
[i
- j
+ 8]);
49 accum1
+= widemul(aa
[j
], bbb
[i
- j
+ 4]);
50 accum0
+= widemul(a
[j
+ 4], bb
[i
- j
+ 4]);
56 c
[i
] = ((uint64_t)(accum0
)) & mask
;
57 c
[i
+ 4] = ((uint64_t)(accum1
)) & mask
;
66 c
[4] = ((uint64_t)(accum0
)) & mask
;
67 c
[0] = ((uint64_t)(accum1
)) & mask
;
72 c
[5] += ((uint64_t)(accum0
));
73 c
[1] += ((uint64_t)(accum1
));
76 void gf_mulw_unsigned(gf_s
* RESTRICT cs
, const gf as
, uint32_t b
)
78 const uint64_t *a
= as
->limb
;
79 uint64_t *c
= cs
->limb
;
80 uint128_t accum0
= 0, accum4
= 0;
81 uint64_t mask
= (1ULL << 56) - 1;
84 for (i
= 0; i
< 4; i
++) {
85 accum0
+= widemul(b
, a
[i
]);
86 accum4
+= widemul(b
, a
[i
+ 4]);
89 c
[i
+ 4] = accum4
& mask
;
93 accum0
+= accum4
+ c
[4];
102 void gf_sqr(gf_s
* RESTRICT cs
, const gf as
)
104 const uint64_t *a
= as
->limb
;
105 uint64_t *c
= cs
->limb
;
106 uint128_t accum0
= 0, accum1
= 0, accum2
;
107 uint64_t mask
= (1ULL << 56) - 1;
111 /* For some reason clang doesn't vectorize this without prompting? */
112 for (i
= 0; i
< 4; i
++)
113 aa
[i
] = a
[i
] + a
[i
+ 4];
115 accum2
= widemul(a
[0], a
[3]);
116 accum0
= widemul(aa
[0], aa
[3]);
117 accum1
= widemul(a
[4], a
[7]);
119 accum2
+= widemul(a
[1], a
[2]);
120 accum0
+= widemul(aa
[1], aa
[2]);
121 accum1
+= widemul(a
[5], a
[6]);
126 c
[3] = ((uint64_t)(accum1
)) << 1 & mask
;
127 c
[7] = ((uint64_t)(accum0
)) << 1 & mask
;
132 accum0
+= widemul(2 * aa
[1], aa
[3]);
133 accum1
+= widemul(2 * a
[5], a
[7]);
134 accum0
+= widemul(aa
[2], aa
[2]);
137 accum0
-= widemul(2 * a
[1], a
[3]);
138 accum1
+= widemul(a
[6], a
[6]);
140 accum2
= widemul(a
[0], a
[0]);
144 accum0
-= widemul(a
[2], a
[2]);
145 accum1
+= widemul(aa
[0], aa
[0]);
146 accum0
+= widemul(a
[4], a
[4]);
148 c
[0] = ((uint64_t)(accum0
)) & mask
;
149 c
[4] = ((uint64_t)(accum1
)) & mask
;
154 accum2
= widemul(2 * aa
[2], aa
[3]);
155 accum0
-= widemul(2 * a
[2], a
[3]);
156 accum1
+= widemul(2 * a
[6], a
[7]);
161 accum2
= widemul(2 * a
[0], a
[1]);
162 accum1
+= widemul(2 * aa
[0], aa
[1]);
163 accum0
+= widemul(2 * a
[4], a
[5]);
168 c
[1] = ((uint64_t)(accum0
)) & mask
;
169 c
[5] = ((uint64_t)(accum1
)) & mask
;
174 accum2
= widemul(aa
[3], aa
[3]);
175 accum0
-= widemul(a
[3], a
[3]);
176 accum1
+= widemul(a
[7], a
[7]);
181 accum2
= widemul(2 * a
[0], a
[2]);
182 accum1
+= widemul(2 * aa
[0], aa
[2]);
183 accum0
+= widemul(2 * a
[4], a
[6]);
185 accum2
+= widemul(a
[1], a
[1]);
186 accum1
+= widemul(aa
[1], aa
[1]);
187 accum0
+= widemul(a
[5], a
[5]);
192 c
[2] = ((uint64_t)(accum0
)) & mask
;
193 c
[6] = ((uint64_t)(accum1
)) & mask
;
200 c
[3] = ((uint64_t)(accum0
)) & mask
;
201 c
[7] = ((uint64_t)(accum1
)) & mask
;
203 /* we could almost stop here, but it wouldn't be stable, so... */
207 c
[4] += ((uint64_t)(accum0
)) + ((uint64_t)(accum1
));
208 c
[0] += ((uint64_t)(accum1
));