/*
 * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License 2.0 (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

#include <openssl/e_os2.h>
#include <string.h>
#include <assert.h>

size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
                   size_t r);
void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r);

#if !defined(KECCAK1600_ASM) || !defined(SELFTEST)

/*
 * Choose some sensible defaults
 */
#if !defined(KECCAK_REF) && !defined(KECCAK_1X) && !defined(KECCAK_1X_ALT) && \
    !defined(KECCAK_2X) && !defined(KECCAK_INPLACE)
# define KECCAK_2X      /* default to KECCAK_2X variant */
#endif

#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
    (defined(__x86_64) && !defined(__BMI__)) || defined(_M_X64) || \
    defined(__mips) || defined(__riscv) || defined(__s390__) || \
    defined(__EMSCRIPTEN__)
/*
 * These don't have an "and with complement" instruction, so minimize the
 * number of "not"-s. Implemented only in the [default] KECCAK_2X variant.
 */
# define KECCAK_COMPLEMENTING_TRANSFORM
#endif
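
/*
 * A short note on why the complementing transform works (a sketch, based
 * on the lane-complementing treatment in the "Keccak implementation
 * overview"): the chi step computes a ^ (~b & c), and by De Morgan
 * ~b & c == ~(b | ~c), hence
 *
 *      a ^ (~b & c) == ~(a ^ (b | ~c))
 *
 * By keeping a fixed pattern of lanes in complemented form (see the six
 * ~A[][] assignments bracketing the round loop in the KECCAK_2X
 * KeccakF1600 below), the per-round "not"-s can be traded for "or"/"and"
 * operations, and the complementation is paid only once before and once
 * after the 24 rounds.
 */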

#if defined(__x86_64__) || defined(__aarch64__) || \
    defined(__mips64) || defined(__ia64) || \
    (defined(__VMS) && !defined(__vax))
/*
 * These are available even in ILP32 flavours, but even then they are
 * capable of performing 64-bit operations as efficiently as in *P64.
 * Since it's not a given that we can use sizeof(void *), just shunt it.
 */
# define BIT_INTERLEAVE (0)
#else
# define BIT_INTERLEAVE (sizeof(void *) < 8)
#endif

#define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))

static uint64_t ROL64(uint64_t val, int offset)
{
    if (offset == 0) {
        return val;
    } else if (!BIT_INTERLEAVE) {
        return (val << offset) | (val >> (64 - offset));
    } else {
        uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;

        if (offset & 1) {
            uint32_t tmp = hi;

            offset >>= 1;
            hi = ROL32(lo, offset);
            lo = ROL32(tmp, offset + 1);
        } else {
            offset >>= 1;
            lo = ROL32(lo, offset);
            hi = ROL32(hi, offset);
        }

        return ((uint64_t)hi << 32) | lo;
    }
}

static const unsigned char rhotates[5][5] = {
    {  0,  1, 62, 28, 27 },
    { 36, 44,  6, 55, 20 },
    {  3, 10, 43, 25, 39 },
    { 41, 45, 15, 21,  8 },
    { 18,  2, 61, 56, 14 }
};

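/*
 * Each entry below holds the same round constant in two encodings: the
 * left-hand value is the bit-interleaved form used when BIT_INTERLEAVE is
 * non-zero (even-indexed bits of the 64-bit constant gathered in the low
 * 32-bit word, odd-indexed bits in the high one, matching the layout
 * produced by BitInterleave() further down), the right-hand value is the
 * canonical round constant from FIPS PUB 202.
 */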
static const uint64_t iotas[] = {
    BIT_INTERLEAVE ? 0x0000000000000001ULL : 0x0000000000000001ULL,
    BIT_INTERLEAVE ? 0x0000008900000000ULL : 0x0000000000008082ULL,
    BIT_INTERLEAVE ? 0x8000008b00000000ULL : 0x800000000000808aULL,
    BIT_INTERLEAVE ? 0x8000808000000000ULL : 0x8000000080008000ULL,
    BIT_INTERLEAVE ? 0x0000008b00000001ULL : 0x000000000000808bULL,
    BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
    BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
    BIT_INTERLEAVE ? 0x8000008200000001ULL : 0x8000000000008009ULL,
    BIT_INTERLEAVE ? 0x0000000b00000000ULL : 0x000000000000008aULL,
    BIT_INTERLEAVE ? 0x0000000a00000000ULL : 0x0000000000000088ULL,
    BIT_INTERLEAVE ? 0x0000808200000001ULL : 0x0000000080008009ULL,
    BIT_INTERLEAVE ? 0x0000800300000000ULL : 0x000000008000000aULL,
    BIT_INTERLEAVE ? 0x0000808b00000001ULL : 0x000000008000808bULL,
    BIT_INTERLEAVE ? 0x8000000b00000001ULL : 0x800000000000008bULL,
    BIT_INTERLEAVE ? 0x8000008a00000001ULL : 0x8000000000008089ULL,
    BIT_INTERLEAVE ? 0x8000008100000001ULL : 0x8000000000008003ULL,
    BIT_INTERLEAVE ? 0x8000008100000000ULL : 0x8000000000008002ULL,
    BIT_INTERLEAVE ? 0x8000000800000000ULL : 0x8000000000000080ULL,
    BIT_INTERLEAVE ? 0x0000008300000000ULL : 0x000000000000800aULL,
    BIT_INTERLEAVE ? 0x8000800300000000ULL : 0x800000008000000aULL,
    BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
    BIT_INTERLEAVE ? 0x8000008800000000ULL : 0x8000000000008080ULL,
    BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
    BIT_INTERLEAVE ? 0x8000808200000000ULL : 0x8000000080008008ULL
};

#if defined(KECCAK_REF)
/*
 * This is a straightforward, or "maximum clarity," implementation aiming
 * to resemble section 3.2 of the FIPS PUB 202 "SHA-3 Standard:
 * Permutation-Based Hash and Extendible-Output Functions" as much as
 * possible. With one caveat. Because of the way C stores matrices,
 * references to A[x,y] in the specification are presented as A[y][x].
 * The implementation unrolls the inner x-loops so that modulo 5 operations
 * are explicitly pre-computed.
 */
static void Theta(uint64_t A[5][5])
{
    uint64_t C[5], D[5];
    size_t y;

    C[0] = A[0][0];
    C[1] = A[0][1];
    C[2] = A[0][2];
    C[3] = A[0][3];
    C[4] = A[0][4];

    for (y = 1; y < 5; y++) {
        C[0] ^= A[y][0];
        C[1] ^= A[y][1];
        C[2] ^= A[y][2];
        C[3] ^= A[y][3];
        C[4] ^= A[y][4];
    }

    D[0] = ROL64(C[1], 1) ^ C[4];
    D[1] = ROL64(C[2], 1) ^ C[0];
    D[2] = ROL64(C[3], 1) ^ C[1];
    D[3] = ROL64(C[4], 1) ^ C[2];
    D[4] = ROL64(C[0], 1) ^ C[3];

    for (y = 0; y < 5; y++) {
        A[y][0] ^= D[0];
        A[y][1] ^= D[1];
        A[y][2] ^= D[2];
        A[y][3] ^= D[3];
        A[y][4] ^= D[4];
    }
}

static void Rho(uint64_t A[5][5])
{
    size_t y;

    for (y = 0; y < 5; y++) {
        A[y][0] = ROL64(A[y][0], rhotates[y][0]);
        A[y][1] = ROL64(A[y][1], rhotates[y][1]);
        A[y][2] = ROL64(A[y][2], rhotates[y][2]);
        A[y][3] = ROL64(A[y][3], rhotates[y][3]);
        A[y][4] = ROL64(A[y][4], rhotates[y][4]);
    }
}

static void Pi(uint64_t A[5][5])
{
    uint64_t T[5][5];

    /*
     * T = A
     * A[y][x] = T[x][(3*y+x)%5]
     */
    memcpy(T, A, sizeof(T));

    A[0][0] = T[0][0];
    A[0][1] = T[1][1];
    A[0][2] = T[2][2];
    A[0][3] = T[3][3];
    A[0][4] = T[4][4];

    A[1][0] = T[0][3];
    A[1][1] = T[1][4];
    A[1][2] = T[2][0];
    A[1][3] = T[3][1];
    A[1][4] = T[4][2];

    A[2][0] = T[0][1];
    A[2][1] = T[1][2];
    A[2][2] = T[2][3];
    A[2][3] = T[3][4];
    A[2][4] = T[4][0];

    A[3][0] = T[0][4];
    A[3][1] = T[1][0];
    A[3][2] = T[2][1];
    A[3][3] = T[3][2];
    A[3][4] = T[4][3];

    A[4][0] = T[0][2];
    A[4][1] = T[1][3];
    A[4][2] = T[2][4];
    A[4][3] = T[3][0];
    A[4][4] = T[4][1];
}

static void Chi(uint64_t A[5][5])
{
    uint64_t C[5];
    size_t y;

    for (y = 0; y < 5; y++) {
        C[0] = A[y][0] ^ (~A[y][1] & A[y][2]);
        C[1] = A[y][1] ^ (~A[y][2] & A[y][3]);
        C[2] = A[y][2] ^ (~A[y][3] & A[y][4]);
        C[3] = A[y][3] ^ (~A[y][4] & A[y][0]);
        C[4] = A[y][4] ^ (~A[y][0] & A[y][1]);

        A[y][0] = C[0];
        A[y][1] = C[1];
        A[y][2] = C[2];
        A[y][3] = C[3];
        A[y][4] = C[4];
    }
}

static void Iota(uint64_t A[5][5], size_t i)
{
    assert(i < (sizeof(iotas) / sizeof(iotas[0])));
    A[0][0] ^= iotas[i];
}

static void KeccakF1600(uint64_t A[5][5])
{
    size_t i;

    for (i = 0; i < 24; i++) {
        Theta(A);
        Rho(A);
        Pi(A);
        Chi(A);
        Iota(A, i);
    }
}

#elif defined(KECCAK_1X)
/*
 * This implementation is an optimization of the code above featuring
 * unrolling of even the y-loops, their fusion and code motion. It also
 * minimizes temporary storage. A compiler would normally do all these
 * things for you; the purpose of the manual optimization is to provide an
 * "unobscured" reference for an assembly implementation [in case this
 * approach is chosen for implementation on some platform]. In a nutshell
 * it's the equivalent of the "plane-per-plane processing" approach
 * discussed in section 2.4 of the "Keccak implementation overview".
 */
static void Round(uint64_t A[5][5], size_t i)
{
    uint64_t C[5], E[2];        /* registers */
    uint64_t D[5], T[2][5];     /* memory    */

    assert(i < (sizeof(iotas) / sizeof(iotas[0])));

    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];

#if defined(__arm__)
    D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
    D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
    D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
    D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
    D[3] = C[2] = ROL64(C[4], 1) ^ C[2];

    T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
    T[0][1] = A[0][1] ^ E[0]; /* D[1] */
    T[0][2] = A[0][2] ^ C[1]; /* D[2] */
    T[0][3] = A[0][3] ^ C[2]; /* D[3] */
    T[0][4] = A[0][4] ^ E[1]; /* D[4] */

    C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
    C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
    C[0] = A[0][0] ^ C[0]; /* rotate by 0 */        /* D[0] */
    C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
    C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);   /* D[1] */
#else
    D[0] = ROL64(C[1], 1) ^ C[4];
    D[1] = ROL64(C[2], 1) ^ C[0];
    D[2] = ROL64(C[3], 1) ^ C[1];
    D[3] = ROL64(C[4], 1) ^ C[2];
    D[4] = ROL64(C[0], 1) ^ C[3];

    T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */
    T[0][1] = A[0][1] ^ D[1];
    T[0][2] = A[0][2] ^ D[2];
    T[0][3] = A[0][3] ^ D[3];
    T[0][4] = A[0][4] ^ D[4];

    C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
    C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
    C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
    C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
    C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
#endif
    A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
    A[0][1] = C[1] ^ (~C[2] & C[3]);
    A[0][2] = C[2] ^ (~C[3] & C[4]);
    A[0][3] = C[3] ^ (~C[4] & C[0]);
    A[0][4] = C[4] ^ (~C[0] & C[1]);

    T[1][0] = A[1][0] ^ (C[3] = D[0]);
    T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */
    T[1][2] = A[1][2] ^ (E[0] = D[2]);
    T[1][3] = A[1][3] ^ (E[1] = D[3]);
    T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */

    C[0] = ROL64(T[0][3],        rhotates[0][3]);
    C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]);   /* D[4] */
    C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]);   /* D[0] */
    C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]);   /* D[1] */
    C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]);   /* D[2] */

    A[1][0] = C[0] ^ (~C[1] & C[2]);
    A[1][1] = C[1] ^ (~C[2] & C[3]);
    A[1][2] = C[2] ^ (~C[3] & C[4]);
    A[1][3] = C[3] ^ (~C[4] & C[0]);
    A[1][4] = C[4] ^ (~C[0] & C[1]);

    C[0] = ROL64(T[0][1],        rhotates[0][1]);
    C[1] = ROL64(T[1][2],        rhotates[1][2]);
    C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
    C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
    C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);

    A[2][0] = C[0] ^ (~C[1] & C[2]);
    A[2][1] = C[1] ^ (~C[2] & C[3]);
    A[2][2] = C[2] ^ (~C[3] & C[4]);
    A[2][3] = C[3] ^ (~C[4] & C[0]);
    A[2][4] = C[4] ^ (~C[0] & C[1]);

    C[0] = ROL64(T[0][4],        rhotates[0][4]);
    C[1] = ROL64(T[1][0],        rhotates[1][0]);
    C[2] = ROL64(T[1][1],        rhotates[2][1]);   /* originally A[2][1] */
    C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
    C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);

    A[3][0] = C[0] ^ (~C[1] & C[2]);
    A[3][1] = C[1] ^ (~C[2] & C[3]);
    A[3][2] = C[2] ^ (~C[3] & C[4]);
    A[3][3] = C[3] ^ (~C[4] & C[0]);
    A[3][4] = C[4] ^ (~C[0] & C[1]);

    C[0] = ROL64(T[0][2],        rhotates[0][2]);
    C[1] = ROL64(T[1][3],        rhotates[1][3]);
    C[2] = ROL64(T[1][4],        rhotates[2][4]);   /* originally A[2][4] */
    C[3] = ROL64(T[0][0],        rhotates[3][0]);   /* originally A[3][0] */
    C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);

    A[4][0] = C[0] ^ (~C[1] & C[2]);
    A[4][1] = C[1] ^ (~C[2] & C[3]);
    A[4][2] = C[2] ^ (~C[3] & C[4]);
    A[4][3] = C[3] ^ (~C[4] & C[0]);
    A[4][4] = C[4] ^ (~C[0] & C[1]);
}

static void KeccakF1600(uint64_t A[5][5])
{
    size_t i;

    for (i = 0; i < 24; i++) {
        Round(A, i);
    }
}

#elif defined(KECCAK_1X_ALT)
/*
 * This is a variant of KECCAK_1X above that reduces the requirement for
 * temporary storage even further, but at the cost of more updates to A[][].
 * It's less suitable if A[][] is memory bound, but better if it's
 * register bound.
 */

static void Round(uint64_t A[5][5], size_t i)
{
    uint64_t C[5], D[5];

    assert(i < (sizeof(iotas) / sizeof(iotas[0])));

    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];

    D[1] = C[0] ^  ROL64(C[2], 1);
    D[2] = C[1] ^  ROL64(C[3], 1);
    D[3] = C[2] ^= ROL64(C[4], 1);
    D[4] = C[3] ^= ROL64(C[0], 1);
    D[0] = C[4] ^= ROL64(C[1], 1);

    A[0][1] ^= D[1];
    A[1][1] ^= D[1];
    A[2][1] ^= D[1];
    A[3][1] ^= D[1];
    A[4][1] ^= D[1];

    A[0][2] ^= D[2];
    A[1][2] ^= D[2];
    A[2][2] ^= D[2];
    A[3][2] ^= D[2];
    A[4][2] ^= D[2];

    A[0][3] ^= C[2];
    A[1][3] ^= C[2];
    A[2][3] ^= C[2];
    A[3][3] ^= C[2];
    A[4][3] ^= C[2];

    A[0][4] ^= C[3];
    A[1][4] ^= C[3];
    A[2][4] ^= C[3];
    A[3][4] ^= C[3];
    A[4][4] ^= C[3];

    A[0][0] ^= C[4];
    A[1][0] ^= C[4];
    A[2][0] ^= C[4];
    A[3][0] ^= C[4];
    A[4][0] ^= C[4];

    C[1] = A[0][1];
    C[2] = A[0][2];
    C[3] = A[0][3];
    C[4] = A[0][4];

    A[0][1] = ROL64(A[1][1], rhotates[1][1]);
    A[0][2] = ROL64(A[2][2], rhotates[2][2]);
    A[0][3] = ROL64(A[3][3], rhotates[3][3]);
    A[0][4] = ROL64(A[4][4], rhotates[4][4]);

    A[1][1] = ROL64(A[1][4], rhotates[1][4]);
    A[2][2] = ROL64(A[2][3], rhotates[2][3]);
    A[3][3] = ROL64(A[3][2], rhotates[3][2]);
    A[4][4] = ROL64(A[4][1], rhotates[4][1]);

    A[1][4] = ROL64(A[4][2], rhotates[4][2]);
    A[2][3] = ROL64(A[3][4], rhotates[3][4]);
    A[3][2] = ROL64(A[2][1], rhotates[2][1]);
    A[4][1] = ROL64(A[1][3], rhotates[1][3]);

    A[4][2] = ROL64(A[2][4], rhotates[2][4]);
    A[3][4] = ROL64(A[4][3], rhotates[4][3]);
    A[2][1] = ROL64(A[1][2], rhotates[1][2]);
    A[1][3] = ROL64(A[3][1], rhotates[3][1]);

    A[2][4] = ROL64(A[4][0], rhotates[4][0]);
    A[4][3] = ROL64(A[3][0], rhotates[3][0]);
    A[1][2] = ROL64(A[2][0], rhotates[2][0]);
    A[3][1] = ROL64(A[1][0], rhotates[1][0]);

    A[1][0] = ROL64(C[3], rhotates[0][3]);
    A[2][0] = ROL64(C[1], rhotates[0][1]);
    A[3][0] = ROL64(C[4], rhotates[0][4]);
    A[4][0] = ROL64(C[2], rhotates[0][2]);

    C[0] = A[0][0];
    C[1] = A[1][0];
    D[0] = A[0][1];
    D[1] = A[1][1];

    A[0][0] ^= (~A[0][1] & A[0][2]);
    A[1][0] ^= (~A[1][1] & A[1][2]);
    A[0][1] ^= (~A[0][2] & A[0][3]);
    A[1][1] ^= (~A[1][2] & A[1][3]);
    A[0][2] ^= (~A[0][3] & A[0][4]);
    A[1][2] ^= (~A[1][3] & A[1][4]);
    A[0][3] ^= (~A[0][4] & C[0]);
    A[1][3] ^= (~A[1][4] & C[1]);
    A[0][4] ^= (~C[0]    & D[0]);
    A[1][4] ^= (~C[1]    & D[1]);

    C[2] = A[2][0];
    C[3] = A[3][0];
    D[2] = A[2][1];
    D[3] = A[3][1];

    A[2][0] ^= (~A[2][1] & A[2][2]);
    A[3][0] ^= (~A[3][1] & A[3][2]);
    A[2][1] ^= (~A[2][2] & A[2][3]);
    A[3][1] ^= (~A[3][2] & A[3][3]);
    A[2][2] ^= (~A[2][3] & A[2][4]);
    A[3][2] ^= (~A[3][3] & A[3][4]);
    A[2][3] ^= (~A[2][4] & C[2]);
    A[3][3] ^= (~A[3][4] & C[3]);
    A[2][4] ^= (~C[2]    & D[2]);
    A[3][4] ^= (~C[3]    & D[3]);

    C[4] = A[4][0];
    D[4] = A[4][1];

    A[4][0] ^= (~A[4][1] & A[4][2]);
    A[4][1] ^= (~A[4][2] & A[4][3]);
    A[4][2] ^= (~A[4][3] & A[4][4]);
    A[4][3] ^= (~A[4][4] & C[4]);
    A[4][4] ^= (~C[4]    & D[4]);
    A[0][0] ^= iotas[i];
}

static void KeccakF1600(uint64_t A[5][5])
{
    size_t i;

    for (i = 0; i < 24; i++) {
        Round(A, i);
    }
}

#elif defined(KECCAK_2X)
/*
 * This implementation is a variant of KECCAK_1X above with the outer-most
 * round loop unrolled twice. This allows temporary storage to be taken
 * out of the round procedure and simplifies references to it by
 * alternating it with the actual data (see the round loop below).
 * Originally it was meant rather as a reference for an assembly
 * implementation, but it seems to play best with compilers [as well as
 * provide the best instruction-per-processed-byte ratio at minimal round
 * unroll factor]...
 */
static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i)
{
    uint64_t C[5], D[5];

    assert(i < (sizeof(iotas) / sizeof(iotas[0])));

    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];

    D[0] = ROL64(C[1], 1) ^ C[4];
    D[1] = ROL64(C[2], 1) ^ C[0];
    D[2] = ROL64(C[3], 1) ^ C[1];
    D[3] = ROL64(C[4], 1) ^ C[2];
    D[4] = ROL64(C[0], 1) ^ C[3];

    C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
    C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
    C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
    C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
    C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i];
    R[0][1] = C[1] ^ (~C[2] | C[3]);
    R[0][2] = C[2] ^ ( C[3] & C[4]);
    R[0][3] = C[3] ^ ( C[4] | C[0]);
    R[0][4] = C[4] ^ ( C[0] & C[1]);
#else
    R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
    R[0][1] = C[1] ^ (~C[2] & C[3]);
    R[0][2] = C[2] ^ (~C[3] & C[4]);
    R[0][3] = C[3] ^ (~C[4] & C[0]);
    R[0][4] = C[4] ^ (~C[0] & C[1]);
#endif

    C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
    C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
    C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
    C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
    C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    R[1][0] = C[0] ^ (C[1] |  C[2]);
    R[1][1] = C[1] ^ (C[2] &  C[3]);
    R[1][2] = C[2] ^ (C[3] | ~C[4]);
    R[1][3] = C[3] ^ (C[4] |  C[0]);
    R[1][4] = C[4] ^ (C[0] &  C[1]);
#else
    R[1][0] = C[0] ^ (~C[1] & C[2]);
    R[1][1] = C[1] ^ (~C[2] & C[3]);
    R[1][2] = C[2] ^ (~C[3] & C[4]);
    R[1][3] = C[3] ^ (~C[4] & C[0]);
    R[1][4] = C[4] ^ (~C[0] & C[1]);
#endif

    C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
    C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
    C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
    C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
    C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    R[2][0] =  C[0] ^ ( C[1] | C[2]);
    R[2][1] =  C[1] ^ ( C[2] & C[3]);
    R[2][2] =  C[2] ^ (~C[3] & C[4]);
    R[2][3] = ~C[3] ^ ( C[4] | C[0]);
    R[2][4] =  C[4] ^ ( C[0] & C[1]);
#else
    R[2][0] = C[0] ^ (~C[1] & C[2]);
    R[2][1] = C[1] ^ (~C[2] & C[3]);
    R[2][2] = C[2] ^ (~C[3] & C[4]);
    R[2][3] = C[3] ^ (~C[4] & C[0]);
    R[2][4] = C[4] ^ (~C[0] & C[1]);
#endif

    C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
    C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
    C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
    C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
    C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    R[3][0] =  C[0] ^ ( C[1] & C[2]);
    R[3][1] =  C[1] ^ ( C[2] | C[3]);
    R[3][2] =  C[2] ^ (~C[3] | C[4]);
    R[3][3] = ~C[3] ^ ( C[4] & C[0]);
    R[3][4] =  C[4] ^ ( C[0] | C[1]);
#else
    R[3][0] = C[0] ^ (~C[1] & C[2]);
    R[3][1] = C[1] ^ (~C[2] & C[3]);
    R[3][2] = C[2] ^ (~C[3] & C[4]);
    R[3][3] = C[3] ^ (~C[4] & C[0]);
    R[3][4] = C[4] ^ (~C[0] & C[1]);
#endif

    C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
    C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
    C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
    C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
    C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    R[4][0] =  C[0] ^ (~C[1] & C[2]);
    R[4][1] = ~C[1] ^ ( C[2] | C[3]);
    R[4][2] =  C[2] ^ ( C[3] & C[4]);
    R[4][3] =  C[3] ^ ( C[4] | C[0]);
    R[4][4] =  C[4] ^ ( C[0] & C[1]);
#else
    R[4][0] = C[0] ^ (~C[1] & C[2]);
    R[4][1] = C[1] ^ (~C[2] & C[3]);
    R[4][2] = C[2] ^ (~C[3] & C[4]);
    R[4][3] = C[3] ^ (~C[4] & C[0]);
    R[4][4] = C[4] ^ (~C[0] & C[1]);
#endif
}

static void KeccakF1600(uint64_t A[5][5])
{
    uint64_t T[5][5];
    size_t i;

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    A[0][1] = ~A[0][1];
    A[0][2] = ~A[0][2];
    A[1][3] = ~A[1][3];
    A[2][2] = ~A[2][2];
    A[3][2] = ~A[3][2];
    A[4][0] = ~A[4][0];
#endif

    for (i = 0; i < 24; i += 2) {
        Round(T, A, i);
        Round(A, T, i + 1);
    }

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    A[0][1] = ~A[0][1];
    A[0][2] = ~A[0][2];
    A[1][3] = ~A[1][3];
    A[2][2] = ~A[2][2];
    A[3][2] = ~A[3][2];
    A[4][0] = ~A[4][0];
#endif
}

#else   /* define KECCAK_INPLACE to compile this code path */
/*
 * This implementation is KECCAK_1X from above combined 4 times with
 * a twist that allows temporary storage to be omitted and processing to
 * be performed in place. It's discussed in section 2.5 of the "Keccak
 * implementation overview". It's likely to be best suited for processors
 * with a large register bank... On the other hand a processor with a
 * large register bank can just as well use KECCAK_1X_ALT; it would be as
 * fast but much more compact...
 */
static void FourRounds(uint64_t A[5][5], size_t i)
{
    uint64_t B[5], C[5], D[5];

    assert(i <= (sizeof(iotas) / sizeof(iotas[0]) - 4));

    /* Round 4*n */
    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];

    D[0] = ROL64(C[1], 1) ^ C[4];
    D[1] = ROL64(C[2], 1) ^ C[0];
    D[2] = ROL64(C[3], 1) ^ C[1];
    D[3] = ROL64(C[4], 1) ^ C[2];
    D[4] = ROL64(C[0], 1) ^ C[3];

    B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
    B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
    B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
    B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
    B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);

    C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i];
    C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]);
    C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]);
    C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]);
    C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]);

    B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
    B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
    B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
    B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
    B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);

    C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
    C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
    C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
    C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
    C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);

    B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
    B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
    B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
    B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
    B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);

    C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
    C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
    C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
    C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
    C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);

    B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
    B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
    B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
    B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
    B[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);

    C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
    C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
    C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
    C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
    C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);

    B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
    B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
    B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
    B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
    B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);

    C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
    C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
    C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
    C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
    C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);

    /* Round 4*n+1 */
    D[0] = ROL64(C[1], 1) ^ C[4];
    D[1] = ROL64(C[2], 1) ^ C[0];
    D[2] = ROL64(C[3], 1) ^ C[1];
    D[3] = ROL64(C[4], 1) ^ C[2];
    D[4] = ROL64(C[0], 1) ^ C[3];

    B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
    B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]);
    B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]);
    B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]);
    B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]);

    C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1];
    C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]);
    C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]);
    C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]);
    C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]);

    B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]);
    B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
    B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]);
    B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]);
    B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]);

    C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
    C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
    C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
    C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
    C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);

    B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]);
    B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]);
    B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
    B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]);
    B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]);

    C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
    C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
    C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
    C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
    C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);

    B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]);
    B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]);
    B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]);
    B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
    B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]);

    C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
    C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
    C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
    C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
    C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);

    B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]);
    B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]);
    B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]);
    B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]);
    B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);

    C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
    C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
    C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
    C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
    C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);

    /* Round 4*n+2 */
    D[0] = ROL64(C[1], 1) ^ C[4];
    D[1] = ROL64(C[2], 1) ^ C[0];
    D[2] = ROL64(C[3], 1) ^ C[1];
    D[3] = ROL64(C[4], 1) ^ C[2];
    D[4] = ROL64(C[0], 1) ^ C[3];

    B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
    B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]);
    B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]);
    B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]);
    B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]);

    C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2];
    C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]);
    C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]);
    C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]);
    C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]);

    B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]);
    B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
    B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]);
    B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]);
    B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]);

    C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
    C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
    C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
    C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
    C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);

    B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]);
    B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]);
    B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
    B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]);
    B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]);

    C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
    C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
    C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
    C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
    C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);

    B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]);
    B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]);
    B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]);
    B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
    B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]);

    C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
    C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
    C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
    C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
    C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);

    B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]);
    B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]);
    B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]);
    B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]);
    B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);

    C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
    C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
    C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
    C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
    C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);

    /* Round 4*n+3 */
    D[0] = ROL64(C[1], 1) ^ C[4];
    D[1] = ROL64(C[2], 1) ^ C[0];
    D[2] = ROL64(C[3], 1) ^ C[1];
    D[3] = ROL64(C[4], 1) ^ C[2];
    D[4] = ROL64(C[0], 1) ^ C[3];

    B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
    B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]);
    B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]);
    B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]);
    B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]);

    /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3];
    /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]);
    /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]);
    /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]);
    /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]);

    B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]);
    B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
    B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]);
    B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]);
    B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]);

    /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]);
    /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]);
    /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]);
    /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]);
    /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]);

    B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]);
    B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]);
    B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
    B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]);
    B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]);

    /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]);
    /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]);
    /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]);
    /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]);
    /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]);

    B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]);
    B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]);
    B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]);
    B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
    B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]);

    /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]);
    /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]);
    /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]);
    /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]);
    /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]);

    B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]);
    B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]);
    B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]);
    B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]);
    B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);

    /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]);
    /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]);
    /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]);
    /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]);
    /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]);
}

static void KeccakF1600(uint64_t A[5][5])
{
    size_t i;

    for (i = 0; i < 24; i += 4) {
        FourRounds(A, i);
    }
}

#endif

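/*
 * BitInterleave() gathers the even-indexed bits of a 64-bit lane into the
 * low 32-bit word of the result and the odd-indexed bits into the high
 * word; BitDeinterleave() is the inverse. In this representation the
 * 64-bit rotations of the permutation reduce to the pairs of 32-bit
 * rotations performed in ROL64() above. Both are no-ops when
 * BIT_INTERLEAVE is zero.
 */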
static uint64_t BitInterleave(uint64_t Ai)
{
    if (BIT_INTERLEAVE) {
        uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
        uint32_t t0, t1;

        t0 = lo & 0x55555555;
        t0 |= t0 >> 1;  t0 &= 0x33333333;
        t0 |= t0 >> 2;  t0 &= 0x0f0f0f0f;
        t0 |= t0 >> 4;  t0 &= 0x00ff00ff;
        t0 |= t0 >> 8;  t0 &= 0x0000ffff;

        t1 = hi & 0x55555555;
        t1 |= t1 >> 1;  t1 &= 0x33333333;
        t1 |= t1 >> 2;  t1 &= 0x0f0f0f0f;
        t1 |= t1 >> 4;  t1 &= 0x00ff00ff;
        t1 |= t1 >> 8;  t1 <<= 16;

        lo &= 0xaaaaaaaa;
        lo |= lo << 1;  lo &= 0xcccccccc;
        lo |= lo << 2;  lo &= 0xf0f0f0f0;
        lo |= lo << 4;  lo &= 0xff00ff00;
        lo |= lo << 8;  lo >>= 16;

        hi &= 0xaaaaaaaa;
        hi |= hi << 1;  hi &= 0xcccccccc;
        hi |= hi << 2;  hi &= 0xf0f0f0f0;
        hi |= hi << 4;  hi &= 0xff00ff00;
        hi |= hi << 8;  hi &= 0xffff0000;

        Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
    }

    return Ai;
}

static uint64_t BitDeinterleave(uint64_t Ai)
{
    if (BIT_INTERLEAVE) {
        uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
        uint32_t t0, t1;

        t0 = lo & 0x0000ffff;
        t0 |= t0 << 8;  t0 &= 0x00ff00ff;
        t0 |= t0 << 4;  t0 &= 0x0f0f0f0f;
        t0 |= t0 << 2;  t0 &= 0x33333333;
        t0 |= t0 << 1;  t0 &= 0x55555555;

        t1 = hi << 16;
        t1 |= t1 >> 8;  t1 &= 0xff00ff00;
        t1 |= t1 >> 4;  t1 &= 0xf0f0f0f0;
        t1 |= t1 >> 2;  t1 &= 0xcccccccc;
        t1 |= t1 >> 1;  t1 &= 0xaaaaaaaa;

        lo >>= 16;
        lo |= lo << 8;  lo &= 0x00ff00ff;
        lo |= lo << 4;  lo &= 0x0f0f0f0f;
        lo |= lo << 2;  lo &= 0x33333333;
        lo |= lo << 1;  lo &= 0x55555555;

        hi &= 0xffff0000;
        hi |= hi >> 8;  hi &= 0xff00ff00;
        hi |= hi >> 4;  hi &= 0xf0f0f0f0;
        hi |= hi >> 2;  hi &= 0xcccccccc;
        hi |= hi >> 1;  hi &= 0xaaaaaaaa;

        Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
    }

    return Ai;
}

/*
 * SHA3_absorb can be called multiple times, but at each invocation the
 * largest multiple of |r| out of |len| bytes is processed. The number of
 * remaining bytes is returned. This is done to spare the caller the
 * trouble of calculating the largest multiple of |r|. |r| can be viewed
 * as the block size. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104,
 * 72, but can also be (1600 - 448)/8 = 144. All this means that message
 * padding and intermediate sub-block buffering, byte- or bitwise, is
 * the caller's responsibility.
 */
size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
                   size_t r)
{
    uint64_t *A_flat = (uint64_t *)A;
    size_t i, w = r / 8;

    assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);

    while (len >= r) {
        for (i = 0; i < w; i++) {
            uint64_t Ai = (uint64_t)inp[0]       | (uint64_t)inp[1] << 8  |
                          (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 |
                          (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 |
                          (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56;
            inp += 8;

            A_flat[i] ^= BitInterleave(Ai);
        }
        KeccakF1600(A);
        len -= r;
    }

    return len;
}
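
/*
 * Illustrative sketch of how a caller can fulfil the padding/buffering
 * contract described above, here for SHA3-256 (r = (1600 - 512) / 8 = 136)
 * with the usual SHA-3 domain padding of a 0x06 trailer byte and a final
 * 0x80 bit. The guard macro and the function name are hypothetical and
 * not part of this file's API.
 */
#ifdef KECCAK1600_ABSORB_EXAMPLE
static void sha3_256_oneshot(const unsigned char *msg, size_t len,
                             unsigned char out[32])
{
    uint64_t A[5][5];
    unsigned char block[(1600 - 512) / 8];
    size_t r = sizeof(block), rem;

    memset(A, 0, sizeof(A));

    /* Absorb all complete r-byte blocks; the remainder is returned. */
    rem = SHA3_absorb(A, msg, len, r);

    /* Buffer the tail, apply pad10*1 with the SHA-3 suffix bits, absorb. */
    memset(block, 0, sizeof(block));
    memcpy(block, msg + (len - rem), rem);
    block[rem] = 0x06;
    block[r - 1] |= 0x80;
    (void)SHA3_absorb(A, block, r, r);

    SHA3_squeeze(A, out, 32, r);
}
#endif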

/*
 * SHA3_squeeze is called once at the end to generate the |len| bytes of
 * hash value into |out|.
 */
void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r)
{
    uint64_t *A_flat = (uint64_t *)A;
    size_t i, w = r / 8;

    assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);

    while (len != 0) {
        for (i = 0; i < w && len != 0; i++) {
            uint64_t Ai = BitDeinterleave(A_flat[i]);

            if (len < 8) {
                for (i = 0; i < len; i++) {
                    *out++ = (unsigned char)Ai;
                    Ai >>= 8;
                }
                return;
            }

            out[0] = (unsigned char)(Ai);
            out[1] = (unsigned char)(Ai >> 8);
            out[2] = (unsigned char)(Ai >> 16);
            out[3] = (unsigned char)(Ai >> 24);
            out[4] = (unsigned char)(Ai >> 32);
            out[5] = (unsigned char)(Ai >> 40);
            out[6] = (unsigned char)(Ai >> 48);
            out[7] = (unsigned char)(Ai >> 56);
            out += 8;
            len -= 8;
        }
        if (len)
            KeccakF1600(A);
    }
}
#endif

#ifdef SELFTEST
/*
 * Post-padding one-shot implementations would look as follows:
 *
 * SHA3_224     SHA3_sponge(inp, len, out, 224/8, (1600-448)/8);
 * SHA3_256     SHA3_sponge(inp, len, out, 256/8, (1600-512)/8);
 * SHA3_384     SHA3_sponge(inp, len, out, 384/8, (1600-768)/8);
 * SHA3_512     SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8);
 * SHAKE_128    SHA3_sponge(inp, len, out, d, (1600-256)/8);
 * SHAKE_256    SHA3_sponge(inp, len, out, d, (1600-512)/8);
 */

void SHA3_sponge(const unsigned char *inp, size_t len,
                 unsigned char *out, size_t d, size_t r)
{
    uint64_t A[5][5];

    memset(A, 0, sizeof(A));
    SHA3_absorb(A, inp, len, r);
    SHA3_squeeze(A, out, d, r);
}

# include <stdio.h>

int main()
{
    /*
     * This is the 5-bit SHAKE128 test from
     * http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing
     */
    unsigned char test[168] = { '\xf3', '\x3' };
    unsigned char out[512];
    size_t i;
    static const unsigned char result[512] = {
        0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B,
        0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF,
        0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98,
        0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA,
        0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB,
        0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C,
        0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C,
        0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D,
        0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD,
        0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F,
        0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC,
        0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C,
        0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6,
        0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE,
        0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3,
        0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64,
        0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6,
        0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5,
        0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38,
        0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2, 0x05,
        0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98,
        0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2,
        0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50,
        0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75,
        0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0,
        0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04,
        0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8,
        0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03,
        0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9,
        0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B,
        0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B,
        0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73,
        0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6,
        0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C,
        0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D,
        0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20,
        0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D,
        0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F,
        0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A,
        0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15,
        0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D,
        0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6,
        0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2,
        0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D,
        0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30,
        0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C,
        0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E,
        0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28,
        0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A,
        0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25,
        0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28,
        0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C,
        0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41,
        0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B,
        0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C,
        0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C,
        0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA,
        0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F,
        0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A,
        0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60,
        0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6,
        0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50,
        0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF,
        0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9
    };

    test[167] = '\x80';
    SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test));

    /*
     * The rationale behind keeping the output [formatted as below] is that
     * one should be able to redirect it to a file, then copy-and-paste the
     * final "output val" from the official example into another file, and
     * compare the two with diff(1).
     */
    for (i = 0; i < sizeof(out);) {
        printf("%02X", out[i]);
        printf(++i % 16 && i != sizeof(out) ? " " : "\n");
    }

    if (memcmp(out, result, sizeof(out))) {
        fprintf(stderr, "failure\n");
        return 1;
    } else {
        fprintf(stderr, "success\n");
        return 0;
    }
}
#endif