/*
 * Copyright (C) 2015 Martin Willi
 * Copyright (C) 2015 revosec AG
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version. See <http://www.fsf.org/copyleft/gpl.txt>.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * for more details.
 */

#include "aesni_gcm.h"
#include "aesni_key.h"

#include <crypto/iv/iv_gen_seq.h>

#include <tmmintrin.h>

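/* The 12-byte GCM nonce is the 4-byte salt taken from the keying material
 * followed by the 8-byte per-message IV (RFC 4106 layout) */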
#define NONCE_SIZE 12
#define IV_SIZE 8
#define SALT_SIZE (NONCE_SIZE - IV_SIZE)

/**
 * Number of blocks processed in parallel to pipeline the AES-NI rounds
 */
#define GCM_CRYPT_PARALLELISM 4

typedef struct private_aesni_gcm_t private_aesni_gcm_t;

/**
 * GCM en/decryption method type
 */
typedef void (*aesni_gcm_fn_t)(private_aesni_gcm_t*, size_t, u_char*, u_char*,
							   u_char*, size_t, u_char*, u_char*);

/**
 * Private data of an aesni_gcm_t object.
 */
struct private_aesni_gcm_t {

	/**
	 * Public aesni_gcm_t interface.
	 */
	aesni_gcm_t public;

	/**
	 * Encryption key schedule
	 */
	aesni_key_t *key;

	/**
	 * IV generator.
	 */
	iv_gen_t *iv_gen;

	/**
	 * Length of the integrity check value
	 */
	size_t icv_size;

	/**
	 * Length of the key in bytes
	 */
	size_t key_size;

	/**
	 * GCM encryption function
	 */
	aesni_gcm_fn_t encrypt;

	/**
	 * GCM decryption function
	 */
	aesni_gcm_fn_t decrypt;

	/**
	 * Salt to add to the nonce
	 */
	u_char salt[SALT_SIZE];

	/**
	 * GHASH subkey H, big-endian
	 */
	__m128i h;

	/**
	 * GHASH key H^2, big-endian
	 */
	__m128i hh;

	/**
	 * GHASH key H^3, big-endian
	 */
	__m128i hhh;

	/**
	 * GHASH key H^4, big-endian
	 */
	__m128i hhhh;
};

/**
 * Byte-swap a 128-bit integer
 */
static inline __m128i swap128(__m128i x)
{
	return _mm_shuffle_epi8(x,
			_mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
}

/**
 * Multiply two blocks in GF128
 */
static __m128i mult_block(__m128i h, __m128i y)
{
	__m128i t1, t2, t3, t4, t5, t6;

	y = swap128(y);

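	/* schoolbook carry-less multiply of the 64-bit halves: low x low,
	 * the two cross products, and high x high */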
	t1 = _mm_clmulepi64_si128(h, y, 0x00);
	t2 = _mm_clmulepi64_si128(h, y, 0x01);
	t3 = _mm_clmulepi64_si128(h, y, 0x10);
	t4 = _mm_clmulepi64_si128(h, y, 0x11);

	t2 = _mm_xor_si128(t2, t3);
	t3 = _mm_slli_si128(t2, 8);
	t2 = _mm_srli_si128(t2, 8);
	t1 = _mm_xor_si128(t1, t3);
	t4 = _mm_xor_si128(t4, t2);

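	/* shift the 256-bit product t4:t1 left by one bit to compensate for the
	 * reflected bit order GHASH uses */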
	t5 = _mm_srli_epi32(t1, 31);
	t1 = _mm_slli_epi32(t1, 1);
	t6 = _mm_srli_epi32(t4, 31);
	t4 = _mm_slli_epi32(t4, 1);

	t3 = _mm_srli_si128(t5, 12);
	t6 = _mm_slli_si128(t6, 4);
	t5 = _mm_slli_si128(t5, 4);
	t1 = _mm_or_si128(t1, t5);
	t4 = _mm_or_si128(t4, t6);
	t4 = _mm_or_si128(t4, t3);

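	/* reduce modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1 */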
	t5 = _mm_slli_epi32(t1, 31);
	t6 = _mm_slli_epi32(t1, 30);
	t3 = _mm_slli_epi32(t1, 25);

	t5 = _mm_xor_si128(t5, t6);
	t5 = _mm_xor_si128(t5, t3);
	t6 = _mm_srli_si128(t5, 4);
	t4 = _mm_xor_si128(t4, t6);
	t5 = _mm_slli_si128(t5, 12);
	t1 = _mm_xor_si128(t1, t5);
	t4 = _mm_xor_si128(t4, t1);

	t5 = _mm_srli_epi32(t1, 1);
	t2 = _mm_srli_epi32(t1, 2);
	t3 = _mm_srli_epi32(t1, 7);
	t4 = _mm_xor_si128(t4, t2);
	t4 = _mm_xor_si128(t4, t3);
	t4 = _mm_xor_si128(t4, t5);

	return swap128(t4);
}

/**
 * Multiply four consecutive blocks by their respective GHASH keys and XOR
 * the results together
 */
static inline __m128i mult4xor(__m128i h1, __m128i h2, __m128i h3, __m128i h4,
							   __m128i d1, __m128i d2, __m128i d3, __m128i d4)
{
	__m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;

	d1 = swap128(d1);
	d2 = swap128(d2);
	d3 = swap128(d3);
	d4 = swap128(d4);

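	/* low and high 64-bit partial products of all four block/key pairs,
	 * XORed together so a single reduction covers all four blocks */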
	t0 = _mm_clmulepi64_si128(h1, d1, 0x00);
	t1 = _mm_clmulepi64_si128(h2, d2, 0x00);
	t2 = _mm_clmulepi64_si128(h3, d3, 0x00);
	t3 = _mm_clmulepi64_si128(h4, d4, 0x00);
	t8 = _mm_xor_si128(t0, t1);
	t8 = _mm_xor_si128(t8, t2);
	t8 = _mm_xor_si128(t8, t3);

	t4 = _mm_clmulepi64_si128(h1, d1, 0x11);
	t5 = _mm_clmulepi64_si128(h2, d2, 0x11);
	t6 = _mm_clmulepi64_si128(h3, d3, 0x11);
	t7 = _mm_clmulepi64_si128(h4, d4, 0x11);
	t9 = _mm_xor_si128(t4, t5);
	t9 = _mm_xor_si128(t9, t6);
	t9 = _mm_xor_si128(t9, t7);

	t0 = _mm_shuffle_epi32(h1, 78);
	t4 = _mm_shuffle_epi32(d1, 78);
	t0 = _mm_xor_si128(t0, h1);
	t4 = _mm_xor_si128(t4, d1);
	t1 = _mm_shuffle_epi32(h2, 78);
	t5 = _mm_shuffle_epi32(d2, 78);
	t1 = _mm_xor_si128(t1, h2);
	t5 = _mm_xor_si128(t5, d2);
	t2 = _mm_shuffle_epi32(h3, 78);
	t6 = _mm_shuffle_epi32(d3, 78);
	t2 = _mm_xor_si128(t2, h3);
	t6 = _mm_xor_si128(t6, d3);
	t3 = _mm_shuffle_epi32(h4, 78);
	t7 = _mm_shuffle_epi32(d4, 78);
	t3 = _mm_xor_si128(t3, h4);
	t7 = _mm_xor_si128(t7, d4);

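	/* Karatsuba middle terms: one carry-less multiply of the XORed halves
	 * per block, for three clmuls per block in total */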
	t0 = _mm_clmulepi64_si128(t0, t4, 0x00);
	t1 = _mm_clmulepi64_si128(t1, t5, 0x00);
	t2 = _mm_clmulepi64_si128(t2, t6, 0x00);
	t3 = _mm_clmulepi64_si128(t3, t7, 0x00);
	t0 = _mm_xor_si128(t0, t8);
	t0 = _mm_xor_si128(t0, t9);
	t0 = _mm_xor_si128(t1, t0);
	t0 = _mm_xor_si128(t2, t0);

	t0 = _mm_xor_si128(t3, t0);
	t4 = _mm_slli_si128(t0, 8);
	t0 = _mm_srli_si128(t0, 8);
	t3 = _mm_xor_si128(t4, t8);
	t6 = _mm_xor_si128(t0, t9);
	t7 = _mm_srli_epi32(t3, 31);
	t8 = _mm_srli_epi32(t6, 31);
	t3 = _mm_slli_epi32(t3, 1);
	t6 = _mm_slli_epi32(t6, 1);
	t9 = _mm_srli_si128(t7, 12);
	t8 = _mm_slli_si128(t8, 4);
	t7 = _mm_slli_si128(t7, 4);
	t3 = _mm_or_si128(t3, t7);
	t6 = _mm_or_si128(t6, t8);
	t6 = _mm_or_si128(t6, t9);
	t7 = _mm_slli_epi32(t3, 31);
	t8 = _mm_slli_epi32(t3, 30);
	t9 = _mm_slli_epi32(t3, 25);
	t7 = _mm_xor_si128(t7, t8);
	t7 = _mm_xor_si128(t7, t9);
	t8 = _mm_srli_si128(t7, 4);
	t7 = _mm_slli_si128(t7, 12);
	t3 = _mm_xor_si128(t3, t7);
	t2 = _mm_srli_epi32(t3, 1);
	t4 = _mm_srli_epi32(t3, 2);
	t5 = _mm_srli_epi32(t3, 7);
	t2 = _mm_xor_si128(t2, t4);
	t2 = _mm_xor_si128(t2, t5);
	t2 = _mm_xor_si128(t2, t8);
	t3 = _mm_xor_si128(t3, t2);
	t6 = _mm_xor_si128(t6, t3);

	return swap128(t6);
}

/**
 * GHASH on a single block
 */
static __m128i ghash(__m128i h, __m128i y, __m128i x)
{
	return mult_block(h, _mm_xor_si128(y, x));
}

/**
 * Start constructing the ICV for the associated data
 */
static __m128i icv_header(private_aesni_gcm_t *this, void *assoc, size_t alen)
{
	u_int blocks, pblocks, rem, i;
	__m128i h1, h2, h3, h4, d1, d2, d3, d4;
	__m128i y, last, *ab;

	h1 = this->hhhh;
	h2 = this->hhh;
	h3 = this->hh;
	h4 = this->h;

	y = _mm_setzero_si128();
	ab = assoc;
	blocks = alen / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = alen % AES_BLOCK_SIZE;
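	/* hash four blocks per iteration using H^4..H^1, the stragglers one at
	 * a time, and zero-pad a trailing partial block */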
	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(ab + i + 0);
		d2 = _mm_loadu_si128(ab + i + 1);
		d3 = _mm_loadu_si128(ab + i + 2);
		d4 = _mm_loadu_si128(ab + i + 3);
		y = _mm_xor_si128(y, d1);
		y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4);
	}
	for (i = pblocks; i < blocks; i++)
	{
		y = ghash(this->h, y, _mm_loadu_si128(ab + i));
	}
	if (rem)
	{
		last = _mm_setzero_si128();
		memcpy(&last, ab + blocks, rem);

		y = ghash(this->h, y, last);
	}

	return y;
}

/**
 * Complete the ICV by hashing an assoc/data length block
 */
static __m128i icv_tailer(private_aesni_gcm_t *this, __m128i y,
						  size_t alen, size_t dlen)
{
	__m128i b;

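	/* final GHASH block: the bit lengths of AAD and data, each written as
	 * a 64-bit big-endian value */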
	htoun64(&b, alen * 8);
	htoun64((u_char*)&b + sizeof(u_int64_t), dlen * 8);

	return ghash(this->h, y, b);
}

/**
 * En-/Decrypt the ICV, trim and store it
 */
static void icv_crypt(private_aesni_gcm_t *this, __m128i y, __m128i j,
					  u_char *icv)
{
	__m128i *ks, t, b;
	u_int round;

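	/* tag T = GHASH XOR E(K, J0), truncated to the configured ICV length */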
	ks = this->key->schedule;
	t = _mm_xor_si128(j, ks[0]);
	for (round = 1; round < this->key->rounds; round++)
	{
		t = _mm_aesenc_si128(t, ks[round]);
	}
	t = _mm_aesenclast_si128(t, ks[this->key->rounds]);

	t = _mm_xor_si128(y, t);

	_mm_storeu_si128(&b, t);
	memcpy(icv, &b, this->icv_size);
}

/**
 * Do big-endian increment on x
 */
static inline __m128i increment_be(__m128i x)
{
	x = swap128(x);
	x = _mm_add_epi64(x, _mm_set_epi32(0, 0, 0, 1));
	x = swap128(x);

	return x;
}

/**
 * Generate the block J0
 */
static inline __m128i create_j(private_aesni_gcm_t *this, u_char *iv)
{
	u_char j[AES_BLOCK_SIZE];

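	/* with a 96-bit nonce, J0 = salt || IV || 0x00000001 */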
	memcpy(j, this->salt, SALT_SIZE);
	memcpy(j + SALT_SIZE, iv, IV_SIZE);
	htoun32(j + SALT_SIZE + IV_SIZE, 1);

	return _mm_loadu_si128((__m128i*)j);
}

/**
 * Encrypt a remaining incomplete block, return updated Y
 */
static __m128i encrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
							   void *in, void *out, __m128i cb, __m128i y)
{
	__m128i *ks, t, b;
	u_int round;

	memset(&b, 0, sizeof(b));
	memcpy(&b, in, rem);

	ks = this->key->schedule;
	t = _mm_xor_si128(cb, ks[0]);
	for (round = 1; round < this->key->rounds; round++)
	{
		t = _mm_aesenc_si128(t, ks[round]);
	}
	t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
	b = _mm_xor_si128(t, b);

	memcpy(out, &b, rem);

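	/* zero the keystream tail so only the rem ciphertext bytes enter GHASH */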
	memset((u_char*)&b + rem, 0, AES_BLOCK_SIZE - rem);
	return ghash(this->h, y, b);
}

/**
 * Decrypt a remaining incomplete block, return updated Y
 */
static __m128i decrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
							   void *in, void *out, __m128i cb, __m128i y)
{
	__m128i *ks, t, b;
	u_int round;

	memset(&b, 0, sizeof(b));
	memcpy(&b, in, rem);

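	/* GHASH the (already zero-padded) ciphertext before decrypting it */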
	y = ghash(this->h, y, b);

	ks = this->key->schedule;
	t = _mm_xor_si128(cb, ks[0]);
	for (round = 1; round < this->key->rounds; round++)
	{
		t = _mm_aesenc_si128(t, ks[round]);
	}
	t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
	b = _mm_xor_si128(t, b);

	memcpy(out, &b, rem);

	return y;
}

/**
 * AES-128 GCM encryption/ICV generation
 */
static void encrypt_gcm128(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

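		/* encrypt four counter blocks in flight so the aesenc latency of
		 * one block overlaps the rounds of the other three */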
		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);

		t1 = _mm_aesenclast_si128(t1, ks[10]);
		t2 = _mm_aesenclast_si128(t2, ks[10]);
		t3 = _mm_aesenclast_si128(t3, ks[10]);
		t4 = _mm_aesenclast_si128(t4, ks[10]);

		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		y = _mm_xor_si128(y, t1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenclast_si128(t1, ks[10]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		y = ghash(this->h, y, t1);

		cb = increment_be(cb);
	}

	if (rem)
	{
		y = encrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}

/**
 * AES-128 GCM decryption/ICV generation
 */
static void decrypt_gcm128(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

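		/* on decryption, GHASH runs over the ciphertext before the
		 * keystream is applied */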
		y = _mm_xor_si128(y, d1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);

		t1 = _mm_aesenclast_si128(t1, ks[10]);
		t2 = _mm_aesenclast_si128(t2, ks[10]);
		t3 = _mm_aesenclast_si128(t3, ks[10]);
		t4 = _mm_aesenclast_si128(t4, ks[10]);

		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		y = ghash(this->h, y, d1);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenclast_si128(t1, ks[10]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		cb = increment_be(cb);
	}

	if (rem)
	{
		y = decrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}

/**
 * AES-192 GCM encryption/ICV generation
 */
static void encrypt_gcm192(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);

		t1 = _mm_aesenclast_si128(t1, ks[12]);
		t2 = _mm_aesenclast_si128(t2, ks[12]);
		t3 = _mm_aesenclast_si128(t3, ks[12]);
		t4 = _mm_aesenclast_si128(t4, ks[12]);

		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		y = _mm_xor_si128(y, t1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenclast_si128(t1, ks[12]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		y = ghash(this->h, y, t1);

		cb = increment_be(cb);
	}

	if (rem)
	{
		y = encrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}

/**
 * AES-192 GCM decryption/ICV generation
 */
static void decrypt_gcm192(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		y = _mm_xor_si128(y, d1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);

		t1 = _mm_aesenclast_si128(t1, ks[12]);
		t2 = _mm_aesenclast_si128(t2, ks[12]);
		t3 = _mm_aesenclast_si128(t3, ks[12]);
		t4 = _mm_aesenclast_si128(t4, ks[12]);

		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		y = ghash(this->h, y, d1);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenclast_si128(t1, ks[12]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		cb = increment_be(cb);
	}

	if (rem)
	{
		y = decrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}

/**
 * AES-256 GCM encryption/ICV generation
 */
static void encrypt_gcm256(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t2 = _mm_aesenc_si128(t2, ks[12]);
		t3 = _mm_aesenc_si128(t3, ks[12]);
		t4 = _mm_aesenc_si128(t4, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t2 = _mm_aesenc_si128(t2, ks[13]);
		t3 = _mm_aesenc_si128(t3, ks[13]);
		t4 = _mm_aesenc_si128(t4, ks[13]);

		t1 = _mm_aesenclast_si128(t1, ks[14]);
		t2 = _mm_aesenclast_si128(t2, ks[14]);
		t3 = _mm_aesenclast_si128(t3, ks[14]);
		t4 = _mm_aesenclast_si128(t4, ks[14]);

		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		y = _mm_xor_si128(y, t1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t1 = _mm_aesenclast_si128(t1, ks[14]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		y = ghash(this->h, y, t1);

		cb = increment_be(cb);
	}

	if (rem)
	{
		y = encrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}

/**
 * AES-256 GCM decryption/ICV generation
 */
static void decrypt_gcm256(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		y = _mm_xor_si128(y, d1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t2 = _mm_aesenc_si128(t2, ks[12]);
		t3 = _mm_aesenc_si128(t3, ks[12]);
		t4 = _mm_aesenc_si128(t4, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t2 = _mm_aesenc_si128(t2, ks[13]);
		t3 = _mm_aesenc_si128(t3, ks[13]);
		t4 = _mm_aesenc_si128(t4, ks[13]);

		t1 = _mm_aesenclast_si128(t1, ks[14]);
		t2 = _mm_aesenclast_si128(t2, ks[14]);
		t3 = _mm_aesenclast_si128(t3, ks[14]);
		t4 = _mm_aesenclast_si128(t4, ks[14]);

		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		y = ghash(this->h, y, d1);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t1 = _mm_aesenclast_si128(t1, ks[14]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		cb = increment_be(cb);
	}

	if (rem)
	{
		y = decrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}

METHOD(aead_t, encrypt, bool,
	private_aesni_gcm_t *this, chunk_t plain, chunk_t assoc, chunk_t iv,
	chunk_t *encr)
{
	u_char *out;

	if (!this->key || iv.len != IV_SIZE)
	{
		return FALSE;
	}
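	/* encrypt in place if the caller does not provide an output chunk */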
	out = plain.ptr;
	if (encr)
	{
		*encr = chunk_alloc(plain.len + this->icv_size);
		out = encr->ptr;
	}
	this->encrypt(this, plain.len, plain.ptr, out, iv.ptr,
				  assoc.len, assoc.ptr, out + plain.len);
	return TRUE;
}

METHOD(aead_t, decrypt, bool,
	private_aesni_gcm_t *this, chunk_t encr, chunk_t assoc, chunk_t iv,
	chunk_t *plain)
{
	u_char *out, icv[this->icv_size];

	if (!this->key || iv.len != IV_SIZE || encr.len < this->icv_size)
	{
		return FALSE;
	}
	encr.len -= this->icv_size;
	out = encr.ptr;
	if (plain)
	{
		*plain = chunk_alloc(encr.len);
		out = plain->ptr;
	}
	this->decrypt(this, encr.len, encr.ptr, out, iv.ptr,
				  assoc.len, assoc.ptr, icv);
	return memeq_const(icv, encr.ptr + encr.len, this->icv_size);
}

METHOD(aead_t, get_block_size, size_t,
	private_aesni_gcm_t *this)
{
	return 1;
}

METHOD(aead_t, get_icv_size, size_t,
	private_aesni_gcm_t *this)
{
	return this->icv_size;
}

METHOD(aead_t, get_iv_size, size_t,
	private_aesni_gcm_t *this)
{
	return IV_SIZE;
}

METHOD(aead_t, get_iv_gen, iv_gen_t*,
	private_aesni_gcm_t *this)
{
	return this->iv_gen;
}

METHOD(aead_t, get_key_size, size_t,
	private_aesni_gcm_t *this)
{
	return this->key_size + SALT_SIZE;
}

METHOD(aead_t, set_key, bool,
	private_aesni_gcm_t *this, chunk_t key)
{
	u_int round;
	__m128i *ks, h;

	if (key.len != this->key_size + SALT_SIZE)
	{
		return FALSE;
	}

	memcpy(this->salt, key.ptr + key.len - SALT_SIZE, SALT_SIZE);
	key.len -= SALT_SIZE;

	DESTROY_IF(this->key);
	this->key = aesni_key_create(TRUE, key);

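	/* the GHASH subkey is H = E(K, 0^128); its powers H^2..H^4 are
	 * precomputed below for the four-block aggregated GHASH */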
	ks = this->key->schedule;
	h = _mm_xor_si128(_mm_setzero_si128(), ks[0]);
	for (round = 1; round < this->key->rounds; round++)
	{
		h = _mm_aesenc_si128(h, ks[round]);
	}
	h = _mm_aesenclast_si128(h, ks[this->key->rounds]);

	this->h = h;
	h = swap128(h);
	this->hh = mult_block(h, this->h);
	this->hhh = mult_block(h, this->hh);
	this->hhhh = mult_block(h, this->hhh);
	this->h = swap128(this->h);
	this->hh = swap128(this->hh);
	this->hhh = swap128(this->hhh);
	this->hhhh = swap128(this->hhhh);

	return TRUE;
}

METHOD(aead_t, destroy, void,
	private_aesni_gcm_t *this)
{
	DESTROY_IF(this->key);
	memwipe(&this->h, sizeof(this->h));
	memwipe(&this->hh, sizeof(this->hh));
	memwipe(&this->hhh, sizeof(this->hhh));
	memwipe(&this->hhhh, sizeof(this->hhhh));
	this->iv_gen->destroy(this->iv_gen);
	free_align(this);
}

/**
 * See header
 */
aesni_gcm_t *aesni_gcm_create(encryption_algorithm_t algo,
							  size_t key_size, size_t salt_size)
{
	private_aesni_gcm_t *this;
	size_t icv_size;

	switch (key_size)
	{
		case 0:
			key_size = 16;
			break;
		case 16:
		case 24:
		case 32:
			break;
		default:
			return NULL;
	}
	if (salt_size && salt_size != SALT_SIZE)
	{
		/* currently not supported */
		return NULL;
	}
	switch (algo)
	{
		case ENCR_AES_GCM_ICV8:
			algo = ENCR_AES_CBC;
			icv_size = 8;
			break;
		case ENCR_AES_GCM_ICV12:
			algo = ENCR_AES_CBC;
			icv_size = 12;
			break;
		case ENCR_AES_GCM_ICV16:
			algo = ENCR_AES_CBC;
			icv_size = 16;
			break;
		default:
			return NULL;
	}

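	/* allocate the instance aligned (see free_align() in destroy) so the
	 * __m128i subkey members keep their natural 16-byte alignment */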
	INIT_ALIGN(this, sizeof(__m128i),
		.public = {
			.aead = {
				.encrypt = _encrypt,
				.decrypt = _decrypt,
				.get_block_size = _get_block_size,
				.get_icv_size = _get_icv_size,
				.get_iv_size = _get_iv_size,
				.get_iv_gen = _get_iv_gen,
				.get_key_size = _get_key_size,
				.set_key = _set_key,
				.destroy = _destroy,
			},
		},
		.key_size = key_size,
		.iv_gen = iv_gen_seq_create(),
		.icv_size = icv_size,
	);

	switch (key_size)
	{
		case 16:
			this->encrypt = encrypt_gcm128;
			this->decrypt = decrypt_gcm128;
			break;
		case 24:
			this->encrypt = encrypt_gcm192;
			this->decrypt = decrypt_gcm192;
			break;
		case 32:
			this->encrypt = encrypt_gcm256;
			this->decrypt = decrypt_gcm256;
			break;
	}

	return &this->public;
}