]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/sha/keccak1600.c
Following the license change, modify the boilerplates in crypto/sha/
[thirdparty/openssl.git] / crypto / sha / keccak1600.c
CommitLineData
b9feae1b
AP
1/*
2 * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3 *
a598ed0d 4 * Licensed under the Apache License 2.0 (the "License"). You may not use
b9feae1b
AP
5 * this file except in compliance with the License. You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
8 */
9
74a011eb 10#include <openssl/e_os2.h>
b9feae1b
AP
11#include <string.h>
12#include <assert.h>
13
c363ce55
AP
14size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
15 size_t r);
16void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r);
17
5d010e3f
AP
18#if !defined(KECCAK1600_ASM) || !defined(SELFTEST)
19
20/*
21 * Choose some sensible defaults
22 */
23#if !defined(KECCAK_REF) && !defined(KECCAK_1X) && !defined(KECCAK_1X_ALT) && \
24 !defined(KECCAK_2X) && !defined(KECCAK_INPLACE)
25# define KECCAK_2X /* default to KECCAK_2X variant */
26#endif
27
28#if defined(__i386) || defined(__i386__) || defined(_M_IX86)
29# define KECCAK_COMPLEMENTING_TRANSFORM
30#endif
71dd3b64 31
13603583
AP
32#if defined(__x86_64__) || defined(__aarch64__) || \
33 defined(__mips64) || defined(__ia64) || \
34 (defined(__VMS) && !defined(__vax))
35/*
36 * These are available even in ILP32 flavours, but even then they are
37 * capable of performing 64-bit operations as efficiently as in *P64.
38 * Since it's not given that we can use sizeof(void *), just shunt it.
39 */
40# define BIT_INTERLEAVE (0)
41#else
42# define BIT_INTERLEAVE (sizeof(void *) < 8)
43#endif
44
0dd0be94
AP
45#define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))
46
47static uint64_t ROL64(uint64_t val, int offset)
48{
49 if (offset == 0) {
50 return val;
13603583 51 } else if (!BIT_INTERLEAVE) {
0dd0be94
AP
52 return (val << offset) | (val >> (64-offset));
53 } else {
54 uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;
55
56 if (offset & 1) {
57 uint32_t tmp = hi;
58
59 offset >>= 1;
60 hi = ROL32(lo, offset);
61 lo = ROL32(tmp, offset + 1);
62 } else {
63 offset >>= 1;
64 lo = ROL32(lo, offset);
65 hi = ROL32(hi, offset);
66 }
67
68 return ((uint64_t)hi << 32) | lo;
69 }
70}
71
72static const unsigned char rhotates[5][5] = {
73 { 0, 1, 62, 28, 27 },
74 { 36, 44, 6, 55, 20 },
75 { 3, 10, 43, 25, 39 },
76 { 41, 45, 15, 21, 8 },
77 { 18, 2, 61, 56, 14 }
78};
79
80static const uint64_t iotas[] = {
13603583
AP
81 BIT_INTERLEAVE ? 0x0000000000000001U : 0x0000000000000001U,
82 BIT_INTERLEAVE ? 0x0000008900000000U : 0x0000000000008082U,
83 BIT_INTERLEAVE ? 0x8000008b00000000U : 0x800000000000808aU,
84 BIT_INTERLEAVE ? 0x8000808000000000U : 0x8000000080008000U,
85 BIT_INTERLEAVE ? 0x0000008b00000001U : 0x000000000000808bU,
86 BIT_INTERLEAVE ? 0x0000800000000001U : 0x0000000080000001U,
87 BIT_INTERLEAVE ? 0x8000808800000001U : 0x8000000080008081U,
88 BIT_INTERLEAVE ? 0x8000008200000001U : 0x8000000000008009U,
89 BIT_INTERLEAVE ? 0x0000000b00000000U : 0x000000000000008aU,
90 BIT_INTERLEAVE ? 0x0000000a00000000U : 0x0000000000000088U,
91 BIT_INTERLEAVE ? 0x0000808200000001U : 0x0000000080008009U,
92 BIT_INTERLEAVE ? 0x0000800300000000U : 0x000000008000000aU,
93 BIT_INTERLEAVE ? 0x0000808b00000001U : 0x000000008000808bU,
94 BIT_INTERLEAVE ? 0x8000000b00000001U : 0x800000000000008bU,
95 BIT_INTERLEAVE ? 0x8000008a00000001U : 0x8000000000008089U,
96 BIT_INTERLEAVE ? 0x8000008100000001U : 0x8000000000008003U,
97 BIT_INTERLEAVE ? 0x8000008100000000U : 0x8000000000008002U,
98 BIT_INTERLEAVE ? 0x8000000800000000U : 0x8000000000000080U,
99 BIT_INTERLEAVE ? 0x0000008300000000U : 0x000000000000800aU,
100 BIT_INTERLEAVE ? 0x8000800300000000U : 0x800000008000000aU,
101 BIT_INTERLEAVE ? 0x8000808800000001U : 0x8000000080008081U,
102 BIT_INTERLEAVE ? 0x8000008800000000U : 0x8000000000008080U,
103 BIT_INTERLEAVE ? 0x0000800000000001U : 0x0000000080000001U,
104 BIT_INTERLEAVE ? 0x8000808200000000U : 0x8000000080008008U
0dd0be94 105};
b9feae1b 106
79dfc3dd
AP
107#if defined(KECCAK_REF)
108/*
109 * This is straightforward or "maximum clarity" implementation aiming
110 * to resemble section 3.2 of the FIPS PUB 202 "SHA-3 Standard:
111 * Permutation-Based Hash and Extendible-Output Functions" as much as
112 * possible. With one caveat. Because of the way C stores matrices,
113 * references to A[x,y] in the specification are presented as A[y][x].
114 * Implementation unrolls inner x-loops so that modulo 5 operations are
115 * explicitly pre-computed.
116 */
b9feae1b
AP
117static void Theta(uint64_t A[5][5])
118{
119 uint64_t C[5], D[5];
120 size_t y;
121
79dfc3dd
AP
122 C[0] = A[0][0];
123 C[1] = A[0][1];
124 C[2] = A[0][2];
125 C[3] = A[0][3];
126 C[4] = A[0][4];
127
128 for (y = 1; y < 5; y++) {
129 C[0] ^= A[y][0];
130 C[1] ^= A[y][1];
131 C[2] ^= A[y][2];
132 C[3] ^= A[y][3];
133 C[4] ^= A[y][4];
134 }
b9feae1b
AP
135
136 D[0] = ROL64(C[1], 1) ^ C[4];
137 D[1] = ROL64(C[2], 1) ^ C[0];
138 D[2] = ROL64(C[3], 1) ^ C[1];
139 D[3] = ROL64(C[4], 1) ^ C[2];
140 D[4] = ROL64(C[0], 1) ^ C[3];
141
142 for (y = 0; y < 5; y++) {
143 A[y][0] ^= D[0];
144 A[y][1] ^= D[1];
145 A[y][2] ^= D[2];
146 A[y][3] ^= D[3];
147 A[y][4] ^= D[4];
148 }
149}
150
151static void Rho(uint64_t A[5][5])
152{
b9feae1b
AP
153 size_t y;
154
155 for (y = 0; y < 5; y++) {
156 A[y][0] = ROL64(A[y][0], rhotates[y][0]);
157 A[y][1] = ROL64(A[y][1], rhotates[y][1]);
158 A[y][2] = ROL64(A[y][2], rhotates[y][2]);
159 A[y][3] = ROL64(A[y][3], rhotates[y][3]);
160 A[y][4] = ROL64(A[y][4], rhotates[y][4]);
161 }
162}
163
164static void Pi(uint64_t A[5][5])
165{
166 uint64_t T[5][5];
167
168 /*
169 * T = A
170 * A[y][x] = T[x][(3*y+x)%5]
171 */
172 memcpy(T, A, sizeof(T));
173
174 A[0][0] = T[0][0];
175 A[0][1] = T[1][1];
176 A[0][2] = T[2][2];
177 A[0][3] = T[3][3];
178 A[0][4] = T[4][4];
179
180 A[1][0] = T[0][3];
181 A[1][1] = T[1][4];
182 A[1][2] = T[2][0];
183 A[1][3] = T[3][1];
184 A[1][4] = T[4][2];
185
186 A[2][0] = T[0][1];
187 A[2][1] = T[1][2];
188 A[2][2] = T[2][3];
189 A[2][3] = T[3][4];
190 A[2][4] = T[4][0];
191
192 A[3][0] = T[0][4];
193 A[3][1] = T[1][0];
194 A[3][2] = T[2][1];
195 A[3][3] = T[3][2];
196 A[3][4] = T[4][3];
197
198 A[4][0] = T[0][2];
199 A[4][1] = T[1][3];
200 A[4][2] = T[2][4];
201 A[4][3] = T[3][0];
202 A[4][4] = T[4][1];
203}
204
205static void Chi(uint64_t A[5][5])
206{
207 uint64_t C[5];
208 size_t y;
209
210 for (y = 0; y < 5; y++) {
211 C[0] = A[y][0] ^ (~A[y][1] & A[y][2]);
212 C[1] = A[y][1] ^ (~A[y][2] & A[y][3]);
213 C[2] = A[y][2] ^ (~A[y][3] & A[y][4]);
214 C[3] = A[y][3] ^ (~A[y][4] & A[y][0]);
215 C[4] = A[y][4] ^ (~A[y][0] & A[y][1]);
216
217 A[y][0] = C[0];
218 A[y][1] = C[1];
219 A[y][2] = C[2];
220 A[y][3] = C[3];
221 A[y][4] = C[4];
222 }
223}
224
225static void Iota(uint64_t A[5][5], size_t i)
226{
b9feae1b
AP
227 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
228 A[0][0] ^= iotas[i];
229}
230
b4f2a462 231static void KeccakF1600(uint64_t A[5][5])
b9feae1b
AP
232{
233 size_t i;
234
235 for (i = 0; i < 24; i++) {
236 Theta(A);
237 Rho(A);
238 Pi(A);
239 Chi(A);
240 Iota(A, i);
241 }
242}
243
79dfc3dd
AP
244#elif defined(KECCAK_1X)
245/*
246 * This implementation is optimization of above code featuring unroll
247 * of even y-loops, their fusion and code motion. It also minimizes
248 * temporary storage. Compiler would normally do all these things for
249 * you, purpose of manual optimization is to provide "unobscured"
250 * reference for assembly implementation [in case this approach is
251 * chosen for implementation on some platform]. In the nutshell it's
252 * equivalent of "plane-per-plane processing" approach discussed in
253 * section 2.4 of "Keccak implementation overview".
254 */
255static void Round(uint64_t A[5][5], size_t i)
256{
c83a4db5
AP
257 uint64_t C[5], E[2]; /* registers */
258 uint64_t D[5], T[2][5]; /* memory */
79dfc3dd
AP
259
260 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
261
262 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
263 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
264 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
265 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
266 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
267
c83a4db5
AP
268#if defined(__arm__)
269 D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
270 D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
271 D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
272 D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
273 D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
274
275 T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
276 T[0][1] = A[0][1] ^ E[0]; /* D[1] */
277 T[0][2] = A[0][2] ^ C[1]; /* D[2] */
278 T[0][3] = A[0][3] ^ C[2]; /* D[3] */
279 T[0][4] = A[0][4] ^ E[1]; /* D[4] */
280
281 C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
282 C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
283 C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
284 C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
285 C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
286#else
79dfc3dd
AP
287 D[0] = ROL64(C[1], 1) ^ C[4];
288 D[1] = ROL64(C[2], 1) ^ C[0];
289 D[2] = ROL64(C[3], 1) ^ C[1];
290 D[3] = ROL64(C[4], 1) ^ C[2];
291 D[4] = ROL64(C[0], 1) ^ C[3];
292
79dfc3dd
AP
293 T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */
294 T[0][1] = A[0][1] ^ D[1];
295 T[0][2] = A[0][2] ^ D[2];
296 T[0][3] = A[0][3] ^ D[3];
297 T[0][4] = A[0][4] ^ D[4];
298
c83a4db5
AP
299 C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
300 C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
301 C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
302 C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
303 C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
304#endif
79dfc3dd
AP
305 A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
306 A[0][1] = C[1] ^ (~C[2] & C[3]);
307 A[0][2] = C[2] ^ (~C[3] & C[4]);
308 A[0][3] = C[3] ^ (~C[4] & C[0]);
309 A[0][4] = C[4] ^ (~C[0] & C[1]);
310
c83a4db5
AP
311 T[1][0] = A[1][0] ^ (C[3] = D[0]);
312 T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */
313 T[1][2] = A[1][2] ^ (E[0] = D[2]);
314 T[1][3] = A[1][3] ^ (E[1] = D[3]);
315 T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */
79dfc3dd 316
c83a4db5
AP
317 C[0] = ROL64(T[0][3], rhotates[0][3]);
318 C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]); /* D[4] */
319 C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]); /* D[0] */
320 C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]); /* D[1] */
321 C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]); /* D[2] */
79dfc3dd
AP
322
323 A[1][0] = C[0] ^ (~C[1] & C[2]);
324 A[1][1] = C[1] ^ (~C[2] & C[3]);
325 A[1][2] = C[2] ^ (~C[3] & C[4]);
326 A[1][3] = C[3] ^ (~C[4] & C[0]);
327 A[1][4] = C[4] ^ (~C[0] & C[1]);
328
329 C[0] = ROL64(T[0][1], rhotates[0][1]);
330 C[1] = ROL64(T[1][2], rhotates[1][2]);
331 C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
332 C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
333 C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
334
335 A[2][0] = C[0] ^ (~C[1] & C[2]);
336 A[2][1] = C[1] ^ (~C[2] & C[3]);
337 A[2][2] = C[2] ^ (~C[3] & C[4]);
338 A[2][3] = C[3] ^ (~C[4] & C[0]);
339 A[2][4] = C[4] ^ (~C[0] & C[1]);
340
341 C[0] = ROL64(T[0][4], rhotates[0][4]);
342 C[1] = ROL64(T[1][0], rhotates[1][0]);
343 C[2] = ROL64(T[1][1], rhotates[2][1]); /* originally A[2][1] */
344 C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
345 C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
346
347 A[3][0] = C[0] ^ (~C[1] & C[2]);
348 A[3][1] = C[1] ^ (~C[2] & C[3]);
349 A[3][2] = C[2] ^ (~C[3] & C[4]);
350 A[3][3] = C[3] ^ (~C[4] & C[0]);
351 A[3][4] = C[4] ^ (~C[0] & C[1]);
352
353 C[0] = ROL64(T[0][2], rhotates[0][2]);
354 C[1] = ROL64(T[1][3], rhotates[1][3]);
355 C[2] = ROL64(T[1][4], rhotates[2][4]); /* originally A[2][4] */
356 C[3] = ROL64(T[0][0], rhotates[3][0]); /* originally A[3][0] */
357 C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
358
359 A[4][0] = C[0] ^ (~C[1] & C[2]);
360 A[4][1] = C[1] ^ (~C[2] & C[3]);
361 A[4][2] = C[2] ^ (~C[3] & C[4]);
362 A[4][3] = C[3] ^ (~C[4] & C[0]);
363 A[4][4] = C[4] ^ (~C[0] & C[1]);
364}
365
b4f2a462 366static void KeccakF1600(uint64_t A[5][5])
79dfc3dd
AP
367{
368 size_t i;
369
370 for (i = 0; i < 24; i++) {
371 Round(A, i);
372 }
373}
374
1ded2dd3
AP
375#elif defined(KECCAK_1X_ALT)
376/*
22f9fa6e
AP
377 * This is variant of above KECCAK_1X that reduces requirement for
378 * temporary storage even further, but at cost of more updates to A[][].
379 * It's less suitable if A[][] is memory bound, but better if it's
1ded2dd3
AP
380 * register bound.
381 */
382
383static void Round(uint64_t A[5][5], size_t i)
384{
385 uint64_t C[5], D[5];
386
387 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
388
389 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
390 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
391 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
392 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
393 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
394
22f9fa6e
AP
395 D[1] = C[0] ^ ROL64(C[2], 1);
396 D[2] = C[1] ^ ROL64(C[3], 1);
397 D[3] = C[2] ^= ROL64(C[4], 1);
398 D[4] = C[3] ^= ROL64(C[0], 1);
399 D[0] = C[4] ^= ROL64(C[1], 1);
1ded2dd3 400
1ded2dd3
AP
401 A[0][1] ^= D[1];
402 A[1][1] ^= D[1];
403 A[2][1] ^= D[1];
404 A[3][1] ^= D[1];
405 A[4][1] ^= D[1];
406
1ded2dd3
AP
407 A[0][2] ^= D[2];
408 A[1][2] ^= D[2];
409 A[2][2] ^= D[2];
410 A[3][2] ^= D[2];
411 A[4][2] ^= D[2];
412
22f9fa6e
AP
413 A[0][3] ^= C[2];
414 A[1][3] ^= C[2];
415 A[2][3] ^= C[2];
416 A[3][3] ^= C[2];
417 A[4][3] ^= C[2];
1ded2dd3 418
22f9fa6e
AP
419 A[0][4] ^= C[3];
420 A[1][4] ^= C[3];
421 A[2][4] ^= C[3];
422 A[3][4] ^= C[3];
423 A[4][4] ^= C[3];
424
425 A[0][0] ^= C[4];
426 A[1][0] ^= C[4];
427 A[2][0] ^= C[4];
428 A[3][0] ^= C[4];
429 A[4][0] ^= C[4];
1ded2dd3
AP
430
431 C[1] = A[0][1];
432 C[2] = A[0][2];
433 C[3] = A[0][3];
434 C[4] = A[0][4];
435
436 A[0][1] = ROL64(A[1][1], rhotates[1][1]);
437 A[0][2] = ROL64(A[2][2], rhotates[2][2]);
438 A[0][3] = ROL64(A[3][3], rhotates[3][3]);
439 A[0][4] = ROL64(A[4][4], rhotates[4][4]);
440
441 A[1][1] = ROL64(A[1][4], rhotates[1][4]);
442 A[2][2] = ROL64(A[2][3], rhotates[2][3]);
443 A[3][3] = ROL64(A[3][2], rhotates[3][2]);
444 A[4][4] = ROL64(A[4][1], rhotates[4][1]);
445
446 A[1][4] = ROL64(A[4][2], rhotates[4][2]);
447 A[2][3] = ROL64(A[3][4], rhotates[3][4]);
448 A[3][2] = ROL64(A[2][1], rhotates[2][1]);
449 A[4][1] = ROL64(A[1][3], rhotates[1][3]);
450
451 A[4][2] = ROL64(A[2][4], rhotates[2][4]);
452 A[3][4] = ROL64(A[4][3], rhotates[4][3]);
453 A[2][1] = ROL64(A[1][2], rhotates[1][2]);
454 A[1][3] = ROL64(A[3][1], rhotates[3][1]);
455
456 A[2][4] = ROL64(A[4][0], rhotates[4][0]);
457 A[4][3] = ROL64(A[3][0], rhotates[3][0]);
458 A[1][2] = ROL64(A[2][0], rhotates[2][0]);
459 A[3][1] = ROL64(A[1][0], rhotates[1][0]);
460
461 A[1][0] = ROL64(C[3], rhotates[0][3]);
462 A[2][0] = ROL64(C[1], rhotates[0][1]);
463 A[3][0] = ROL64(C[4], rhotates[0][4]);
464 A[4][0] = ROL64(C[2], rhotates[0][2]);
465
466 C[0] = A[0][0];
467 C[1] = A[1][0];
1ded2dd3
AP
468 D[0] = A[0][1];
469 D[1] = A[1][1];
1ded2dd3
AP
470
471 A[0][0] ^= (~A[0][1] & A[0][2]);
472 A[1][0] ^= (~A[1][1] & A[1][2]);
1ded2dd3
AP
473 A[0][1] ^= (~A[0][2] & A[0][3]);
474 A[1][1] ^= (~A[1][2] & A[1][3]);
1ded2dd3
AP
475 A[0][2] ^= (~A[0][3] & A[0][4]);
476 A[1][2] ^= (~A[1][3] & A[1][4]);
1ded2dd3
AP
477 A[0][3] ^= (~A[0][4] & C[0]);
478 A[1][3] ^= (~A[1][4] & C[1]);
1ded2dd3
AP
479 A[0][4] ^= (~C[0] & D[0]);
480 A[1][4] ^= (~C[1] & D[1]);
22f9fa6e
AP
481
482 C[2] = A[2][0];
483 C[3] = A[3][0];
484 D[2] = A[2][1];
485 D[3] = A[3][1];
486
487 A[2][0] ^= (~A[2][1] & A[2][2]);
488 A[3][0] ^= (~A[3][1] & A[3][2]);
489 A[2][1] ^= (~A[2][2] & A[2][3]);
490 A[3][1] ^= (~A[3][2] & A[3][3]);
491 A[2][2] ^= (~A[2][3] & A[2][4]);
492 A[3][2] ^= (~A[3][3] & A[3][4]);
493 A[2][3] ^= (~A[2][4] & C[2]);
494 A[3][3] ^= (~A[3][4] & C[3]);
1ded2dd3
AP
495 A[2][4] ^= (~C[2] & D[2]);
496 A[3][4] ^= (~C[3] & D[3]);
1ded2dd3 497
22f9fa6e
AP
498 C[4] = A[4][0];
499 D[4] = A[4][1];
500
501 A[4][0] ^= (~A[4][1] & A[4][2]);
502 A[4][1] ^= (~A[4][2] & A[4][3]);
503 A[4][2] ^= (~A[4][3] & A[4][4]);
504 A[4][3] ^= (~A[4][4] & C[4]);
505 A[4][4] ^= (~C[4] & D[4]);
1ded2dd3
AP
506 A[0][0] ^= iotas[i];
507}
508
b4f2a462 509static void KeccakF1600(uint64_t A[5][5])
1ded2dd3
AP
510{
511 size_t i;
512
513 for (i = 0; i < 24; i++) {
514 Round(A, i);
515 }
516}
517
79dfc3dd
AP
518#elif defined(KECCAK_2X)
519/*
520 * This implementation is variant of KECCAK_1X above with outer-most
521 * round loop unrolled twice. This allows to take temporary storage
522 * out of round procedure and simplify references to it by alternating
5d010e3f
AP
523 * it with actual data (see round loop below). Originally it was meant
524 * rather as reference for an assembly implementation, but it seems to
525 * play best with compilers [as well as provide best instruction per
526 * processed byte ratio at minimal round unroll factor]...
79dfc3dd
AP
527 */
528static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i)
529{
530 uint64_t C[5], D[5];
79dfc3dd
AP
531
532 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
533
534 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
535 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
536 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
537 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
538 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
539
540 D[0] = ROL64(C[1], 1) ^ C[4];
541 D[1] = ROL64(C[2], 1) ^ C[0];
542 D[2] = ROL64(C[3], 1) ^ C[1];
543 D[3] = ROL64(C[4], 1) ^ C[2];
544 D[4] = ROL64(C[0], 1) ^ C[3];
545
546 C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
547 C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
548 C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
549 C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
550 C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
551
1f2aff25
AP
552#ifdef KECCAK_COMPLEMENTING_TRANSFORM
553 R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i];
554 R[0][1] = C[1] ^ (~C[2] | C[3]);
555 R[0][2] = C[2] ^ ( C[3] & C[4]);
556 R[0][3] = C[3] ^ ( C[4] | C[0]);
557 R[0][4] = C[4] ^ ( C[0] & C[1]);
558#else
79dfc3dd
AP
559 R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
560 R[0][1] = C[1] ^ (~C[2] & C[3]);
561 R[0][2] = C[2] ^ (~C[3] & C[4]);
562 R[0][3] = C[3] ^ (~C[4] & C[0]);
563 R[0][4] = C[4] ^ (~C[0] & C[1]);
1f2aff25 564#endif
79dfc3dd
AP
565
566 C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
567 C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
568 C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
569 C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
570 C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
571
1f2aff25
AP
572#ifdef KECCAK_COMPLEMENTING_TRANSFORM
573 R[1][0] = C[0] ^ (C[1] | C[2]);
574 R[1][1] = C[1] ^ (C[2] & C[3]);
575 R[1][2] = C[2] ^ (C[3] | ~C[4]);
576 R[1][3] = C[3] ^ (C[4] | C[0]);
577 R[1][4] = C[4] ^ (C[0] & C[1]);
578#else
79dfc3dd
AP
579 R[1][0] = C[0] ^ (~C[1] & C[2]);
580 R[1][1] = C[1] ^ (~C[2] & C[3]);
581 R[1][2] = C[2] ^ (~C[3] & C[4]);
582 R[1][3] = C[3] ^ (~C[4] & C[0]);
583 R[1][4] = C[4] ^ (~C[0] & C[1]);
1f2aff25 584#endif
79dfc3dd
AP
585
586 C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
587 C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
588 C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
589 C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
590 C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
591
1f2aff25
AP
592#ifdef KECCAK_COMPLEMENTING_TRANSFORM
593 R[2][0] = C[0] ^ ( C[1] | C[2]);
594 R[2][1] = C[1] ^ ( C[2] & C[3]);
595 R[2][2] = C[2] ^ (~C[3] & C[4]);
596 R[2][3] = ~C[3] ^ ( C[4] | C[0]);
597 R[2][4] = C[4] ^ ( C[0] & C[1]);
598#else
79dfc3dd
AP
599 R[2][0] = C[0] ^ (~C[1] & C[2]);
600 R[2][1] = C[1] ^ (~C[2] & C[3]);
601 R[2][2] = C[2] ^ (~C[3] & C[4]);
602 R[2][3] = C[3] ^ (~C[4] & C[0]);
603 R[2][4] = C[4] ^ (~C[0] & C[1]);
1f2aff25 604#endif
79dfc3dd
AP
605
606 C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
607 C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
608 C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
609 C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
610 C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
611
1f2aff25
AP
612#ifdef KECCAK_COMPLEMENTING_TRANSFORM
613 R[3][0] = C[0] ^ ( C[1] & C[2]);
614 R[3][1] = C[1] ^ ( C[2] | C[3]);
615 R[3][2] = C[2] ^ (~C[3] | C[4]);
616 R[3][3] = ~C[3] ^ ( C[4] & C[0]);
617 R[3][4] = C[4] ^ ( C[0] | C[1]);
618#else
79dfc3dd
AP
619 R[3][0] = C[0] ^ (~C[1] & C[2]);
620 R[3][1] = C[1] ^ (~C[2] & C[3]);
621 R[3][2] = C[2] ^ (~C[3] & C[4]);
622 R[3][3] = C[3] ^ (~C[4] & C[0]);
623 R[3][4] = C[4] ^ (~C[0] & C[1]);
1f2aff25 624#endif
79dfc3dd
AP
625
626 C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
627 C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
628 C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
629 C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
630 C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
631
1f2aff25
AP
632#ifdef KECCAK_COMPLEMENTING_TRANSFORM
633 R[4][0] = C[0] ^ (~C[1] & C[2]);
634 R[4][1] = ~C[1] ^ ( C[2] | C[3]);
635 R[4][2] = C[2] ^ ( C[3] & C[4]);
636 R[4][3] = C[3] ^ ( C[4] | C[0]);
637 R[4][4] = C[4] ^ ( C[0] & C[1]);
638#else
79dfc3dd
AP
639 R[4][0] = C[0] ^ (~C[1] & C[2]);
640 R[4][1] = C[1] ^ (~C[2] & C[3]);
641 R[4][2] = C[2] ^ (~C[3] & C[4]);
642 R[4][3] = C[3] ^ (~C[4] & C[0]);
643 R[4][4] = C[4] ^ (~C[0] & C[1]);
1f2aff25 644#endif
79dfc3dd
AP
645}
646
b4f2a462 647static void KeccakF1600(uint64_t A[5][5])
79dfc3dd
AP
648{
649 uint64_t T[5][5];
650 size_t i;
651
1f2aff25
AP
652#ifdef KECCAK_COMPLEMENTING_TRANSFORM
653 A[0][1] = ~A[0][1];
654 A[0][2] = ~A[0][2];
655 A[1][3] = ~A[1][3];
656 A[2][2] = ~A[2][2];
657 A[3][2] = ~A[3][2];
658 A[4][0] = ~A[4][0];
659#endif
660
79dfc3dd
AP
661 for (i = 0; i < 24; i += 2) {
662 Round(T, A, i);
663 Round(A, T, i + 1);
664 }
1f2aff25
AP
665
666#ifdef KECCAK_COMPLEMENTING_TRANSFORM
667 A[0][1] = ~A[0][1];
668 A[0][2] = ~A[0][2];
669 A[1][3] = ~A[1][3];
670 A[2][2] = ~A[2][2];
671 A[3][2] = ~A[3][2];
672 A[4][0] = ~A[4][0];
673#endif
79dfc3dd
AP
674}
675
5d010e3f 676#else /* define KECCAK_INPLACE to compile this code path */
79dfc3dd
AP
677/*
678 * This implementation is KECCAK_1X from above combined 4 times with
679 * a twist that allows to omit temporary storage and perform in-place
680 * processing. It's discussed in section 2.5 of "Keccak implementation
681 * overview". It's likely to be best suited for processors with large
5d010e3f
AP
682 * register bank... On the other hand processor with large register
683 * bank can as well use KECCAK_1X_ALT, it would be as fast but much
684 * more compact...
79dfc3dd
AP
685 */
686static void FourRounds(uint64_t A[5][5], size_t i)
687{
688 uint64_t B[5], C[5], D[5];
79dfc3dd
AP
689
690 assert(i <= (sizeof(iotas) / sizeof(iotas[0]) - 4));
691
692 /* Round 4*n */
693 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
694 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
695 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
696 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
697 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
698
699 D[0] = ROL64(C[1], 1) ^ C[4];
700 D[1] = ROL64(C[2], 1) ^ C[0];
701 D[2] = ROL64(C[3], 1) ^ C[1];
702 D[3] = ROL64(C[4], 1) ^ C[2];
703 D[4] = ROL64(C[0], 1) ^ C[3];
704
705 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
706 B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
707 B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
708 B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
709 B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
710
711 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i];
712 C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]);
713 C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]);
714 C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]);
715 C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]);
716
717 B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
718 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
719 B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
720 B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
721 B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
722
723 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
724 C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
725 C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
726 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
727 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
728
729 B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
730 B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
731 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
732 B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
733 B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
734
735 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
736 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
737 C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
738 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
739 C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
740
741 B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
742 B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
743 B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
744 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
745 B[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
746
747 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
748 C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
749 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
750 C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
751 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
752
753 B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
754 B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
755 B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
756 B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
757 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
758
759 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
760 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
761 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
762 C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
763 C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
764
765 /* Round 4*n+1 */
766 D[0] = ROL64(C[1], 1) ^ C[4];
767 D[1] = ROL64(C[2], 1) ^ C[0];
768 D[2] = ROL64(C[3], 1) ^ C[1];
769 D[3] = ROL64(C[4], 1) ^ C[2];
770 D[4] = ROL64(C[0], 1) ^ C[3];
771
772 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
773 B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]);
774 B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]);
775 B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]);
776 B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]);
777
778 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1];
779 C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]);
780 C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]);
781 C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]);
782 C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]);
783
784 B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]);
785 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
786 B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]);
787 B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]);
788 B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]);
789
790 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
791 C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
792 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
793 C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
794 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
795
796 B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]);
797 B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]);
798 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
799 B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]);
800 B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]);
801
802 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
803 C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
804 C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
805 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
806 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
807
808 B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]);
809 B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]);
810 B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]);
811 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
812 B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]);
813
814 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
815 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
816 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
817 C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
818 C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
819
820 B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]);
821 B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]);
822 B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]);
823 B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]);
824 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
825
826 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
827 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
828 C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
829 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
830 C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
831
832 /* Round 4*n+2 */
833 D[0] = ROL64(C[1], 1) ^ C[4];
834 D[1] = ROL64(C[2], 1) ^ C[0];
835 D[2] = ROL64(C[3], 1) ^ C[1];
836 D[3] = ROL64(C[4], 1) ^ C[2];
837 D[4] = ROL64(C[0], 1) ^ C[3];
838
839 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
840 B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]);
841 B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]);
842 B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]);
843 B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]);
844
845 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2];
846 C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]);
847 C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]);
848 C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]);
849 C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]);
850
851 B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]);
852 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
853 B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]);
854 B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]);
855 B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]);
856
857 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
858 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
859 C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
860 C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
861 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
862
863 B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]);
864 B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]);
865 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
866 B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]);
867 B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]);
868
869 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
870 C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
871 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
872 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
873 C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
874
875 B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]);
876 B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]);
877 B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]);
878 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
879 B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]);
880
881 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
882 C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
883 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
884 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
885 C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
886
887 B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]);
888 B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]);
889 B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]);
890 B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]);
891 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
892
893 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
894 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
895 C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
896 C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
897 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
898
899 /* Round 4*n+3 */
900 D[0] = ROL64(C[1], 1) ^ C[4];
901 D[1] = ROL64(C[2], 1) ^ C[0];
902 D[2] = ROL64(C[3], 1) ^ C[1];
903 D[3] = ROL64(C[4], 1) ^ C[2];
904 D[4] = ROL64(C[0], 1) ^ C[3];
905
906 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
907 B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]);
908 B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]);
909 B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]);
910 B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]);
911
912 /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3];
913 /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]);
914 /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]);
915 /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]);
916 /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]);
917
918 B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]);
919 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
920 B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]);
921 B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]);
922 B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]);
923
924 /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]);
925 /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]);
926 /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]);
927 /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]);
928 /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]);
929
930 B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]);
931 B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]);
932 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
933 B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]);
934 B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]);
935
936 /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]);
937 /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]);
938 /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]);
939 /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]);
940 /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]);
941
942 B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]);
943 B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]);
944 B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]);
945 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
946 B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]);
947
948 /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]);
949 /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]);
950 /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]);
951 /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]);
952 /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]);
953
954 B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]);
955 B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]);
956 B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]);
957 B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]);
958 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
959
960 /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]);
961 /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]);
962 /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]);
963 /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]);
964 /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]);
965}
966
b4f2a462 967static void KeccakF1600(uint64_t A[5][5])
79dfc3dd
AP
968{
969 size_t i;
970
971 for (i = 0; i < 24; i += 4) {
972 FourRounds(A, i);
973 }
974}
975
976#endif
977
0dd0be94
AP
978static uint64_t BitInterleave(uint64_t Ai)
979{
13603583
AP
980 if (BIT_INTERLEAVE) {
981 uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
982 uint32_t t0, t1;
983
984 t0 = lo & 0x55555555;
985 t0 |= t0 >> 1; t0 &= 0x33333333;
986 t0 |= t0 >> 2; t0 &= 0x0f0f0f0f;
987 t0 |= t0 >> 4; t0 &= 0x00ff00ff;
988 t0 |= t0 >> 8; t0 &= 0x0000ffff;
989
990 t1 = hi & 0x55555555;
991 t1 |= t1 >> 1; t1 &= 0x33333333;
992 t1 |= t1 >> 2; t1 &= 0x0f0f0f0f;
993 t1 |= t1 >> 4; t1 &= 0x00ff00ff;
994 t1 |= t1 >> 8; t1 <<= 16;
995
996 lo &= 0xaaaaaaaa;
997 lo |= lo << 1; lo &= 0xcccccccc;
998 lo |= lo << 2; lo &= 0xf0f0f0f0;
999 lo |= lo << 4; lo &= 0xff00ff00;
1000 lo |= lo << 8; lo >>= 16;
1001
1002 hi &= 0xaaaaaaaa;
1003 hi |= hi << 1; hi &= 0xcccccccc;
1004 hi |= hi << 2; hi &= 0xf0f0f0f0;
1005 hi |= hi << 4; hi &= 0xff00ff00;
1006 hi |= hi << 8; hi &= 0xffff0000;
1007
1008 Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
0dd0be94
AP
1009 }
1010
1011 return Ai;
1012}
1013
1014static uint64_t BitDeinterleave(uint64_t Ai)
1015{
13603583 1016 if (BIT_INTERLEAVE) {
0dd0be94 1017 uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
13603583
AP
1018 uint32_t t0, t1;
1019
1020 t0 = lo & 0x0000ffff;
1021 t0 |= t0 << 8; t0 &= 0x00ff00ff;
1022 t0 |= t0 << 4; t0 &= 0x0f0f0f0f;
1023 t0 |= t0 << 2; t0 &= 0x33333333;
1024 t0 |= t0 << 1; t0 &= 0x55555555;
1025
1026 t1 = hi << 16;
1027 t1 |= t1 >> 8; t1 &= 0xff00ff00;
1028 t1 |= t1 >> 4; t1 &= 0xf0f0f0f0;
1029 t1 |= t1 >> 2; t1 &= 0xcccccccc;
1030 t1 |= t1 >> 1; t1 &= 0xaaaaaaaa;
1031
1032 lo >>= 16;
1033 lo |= lo << 8; lo &= 0x00ff00ff;
1034 lo |= lo << 4; lo &= 0x0f0f0f0f;
1035 lo |= lo << 2; lo &= 0x33333333;
1036 lo |= lo << 1; lo &= 0x55555555;
1037
1038 hi &= 0xffff0000;
1039 hi |= hi >> 8; hi &= 0xff00ff00;
1040 hi |= hi >> 4; hi &= 0xf0f0f0f0;
1041 hi |= hi >> 2; hi &= 0xcccccccc;
1042 hi |= hi >> 1; hi &= 0xaaaaaaaa;
1043
1044 Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
0dd0be94
AP
1045 }
1046
1047 return Ai;
1048}
1049
b9feae1b
AP
1050/*
1051 * SHA3_absorb can be called multiple times, but at each invocation
1052 * largest multiple of |r| out of |len| bytes are processed. Then
c83a4db5
AP
1053 * remaining amount of bytes is returned. This is done to spare caller
1054 * trouble of calculating the largest multiple of |r|. |r| can be viewed
1055 * as blocksize. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104,
1056 * 72, but can also be (1600 - 448)/8 = 144. All this means that message
b9feae1b 1057 * padding and intermediate sub-block buffering, byte- or bitwise, is
46f4e1be 1058 * caller's responsibility.
b9feae1b
AP
1059 */
1060size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
1061 size_t r)
1062{
1063 uint64_t *A_flat = (uint64_t *)A;
1064 size_t i, w = r / 8;
1065
4b904301
AP
1066 assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1067
b9feae1b
AP
1068 while (len >= r) {
1069 for (i = 0; i < w; i++) {
0dd0be94
AP
1070 uint64_t Ai = (uint64_t)inp[0] | (uint64_t)inp[1] << 8 |
1071 (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 |
1072 (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 |
1073 (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56;
b9feae1b 1074 inp += 8;
0dd0be94
AP
1075
1076 A_flat[i] ^= BitInterleave(Ai);
b9feae1b
AP
1077 }
1078 KeccakF1600(A);
1079 len -= r;
1080 }
1081
1082 return len;
1083}
1084
1085/*
1086 * SHA3_squeeze is called once at the end to generate |out| hash value
1087 * of |len| bytes.
1088 */
1089void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r)
1090{
1091 uint64_t *A_flat = (uint64_t *)A;
b4f2a462 1092 size_t i, w = r / 8;
b9feae1b 1093
4b904301
AP
1094 assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1095
b4f2a462
AP
1096 while (len != 0) {
1097 for (i = 0; i < w && len != 0; i++) {
0dd0be94 1098 uint64_t Ai = BitDeinterleave(A_flat[i]);
b9feae1b 1099
b4f2a462
AP
1100 if (len < 8) {
1101 for (i = 0; i < len; i++) {
1102 *out++ = (unsigned char)Ai;
1103 Ai >>= 8;
1104 }
1105 return;
1106 }
1107
b9feae1b
AP
1108 out[0] = (unsigned char)(Ai);
1109 out[1] = (unsigned char)(Ai >> 8);
1110 out[2] = (unsigned char)(Ai >> 16);
1111 out[3] = (unsigned char)(Ai >> 24);
1112 out[4] = (unsigned char)(Ai >> 32);
1113 out[5] = (unsigned char)(Ai >> 40);
1114 out[6] = (unsigned char)(Ai >> 48);
1115 out[7] = (unsigned char)(Ai >> 56);
1116 out += 8;
b4f2a462 1117 len -= 8;
b9feae1b 1118 }
b9feae1b
AP
1119 if (len)
1120 KeccakF1600(A);
1121 }
b9feae1b 1122}
71dd3b64 1123#endif
b9feae1b
AP
1124
1125#ifdef SELFTEST
1126/*
1127 * Post-padding one-shot implementations would look as following:
1128 *
1129 * SHA3_224 SHA3_sponge(inp, len, out, 224/8, (1600-448)/8);
1130 * SHA3_256 SHA3_sponge(inp, len, out, 256/8, (1600-512)/8);
1131 * SHA3_384 SHA3_sponge(inp, len, out, 384/8, (1600-768)/8);
1132 * SHA3_512 SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8);
1133 * SHAKE_128 SHA3_sponge(inp, len, out, d, (1600-256)/8);
1134 * SHAKE_256 SHA3_sponge(inp, len, out, d, (1600-512)/8);
1135 */
1136
1137void SHA3_sponge(const unsigned char *inp, size_t len,
1138 unsigned char *out, size_t d, size_t r)
1139{
1140 uint64_t A[5][5];
1141
1142 memset(A, 0, sizeof(A));
1143 SHA3_absorb(A, inp, len, r);
1144 SHA3_squeeze(A, out, d, r);
1145}
1146
1147# include <stdio.h>
1148
1149int main()
1150{
c3086f46
AP
1151 /*
1152 * This is 5-bit SHAKE128 test from http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing
1153 */
b9feae1b
AP
1154 unsigned char test[168] = { '\xf3', '\x3' };
1155 unsigned char out[512];
1156 size_t i;
c3086f46
AP
1157 static const unsigned char result[512] = {
1158 0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B,
1159 0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF,
1160 0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98,
1161 0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA,
1162 0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB,
1163 0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C,
1164 0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C,
1165 0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D,
1166 0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD,
1167 0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F,
1168 0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC,
1169 0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C,
1170 0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6,
1171 0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE,
1172 0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3,
1173 0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64,
1174 0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6,
1175 0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5,
1176 0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38,
1177 0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2, 0x05,
1178 0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98,
1179 0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2,
1180 0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50,
1181 0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75,
1182 0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0,
1183 0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04,
1184 0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8,
1185 0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03,
1186 0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9,
1187 0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B,
1188 0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B,
1189 0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73,
1190 0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6,
1191 0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C,
1192 0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D,
1193 0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20,
1194 0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D,
1195 0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F,
1196 0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A,
1197 0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15,
1198 0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D,
1199 0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6,
1200 0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2,
1201 0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D,
1202 0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30,
1203 0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C,
1204 0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E,
1205 0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28,
1206 0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A,
1207 0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25,
1208 0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28,
1209 0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C,
1210 0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41,
1211 0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B,
1212 0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C,
1213 0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C,
1214 0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA,
1215 0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F,
1216 0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A,
1217 0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60,
1218 0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6,
1219 0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50,
1220 0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF,
1221 0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9
1222 };
b9feae1b 1223
b9feae1b
AP
1224 test[167] = '\x80';
1225 SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test));
1226
c3086f46
AP
1227 /*
1228 * Rationale behind keeping output [formatted as below] is that
1229 * one should be able to redirect it to a file, then copy-n-paste
1230 * final "output val" from official example to another file, and
1231 * compare the two with diff(1).
1232 */
b9feae1b
AP
1233 for (i = 0; i < sizeof(out);) {
1234 printf("%02X", out[i]);
1235 printf(++i % 16 && i != sizeof(out) ? " " : "\n");
1236 }
c3086f46
AP
1237
1238 if (memcmp(out,result,sizeof(out))) {
1239 fprintf(stderr,"failure\n");
1240 return 1;
1241 } else {
1242 fprintf(stderr,"success\n");
1243 return 0;
1244 }
b9feae1b
AP
1245}
1246#endif