/*
 * Copyright (C) 2015 Martin Willi
 * Copyright (C) 2015 revosec AG
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 */
#include "aesni_gcm.h"
#include "aesni_key.h"

#include <crypto/iv/iv_gen_seq.h>

#include <tmmintrin.h>

#define NONCE_SIZE 12
#define IV_SIZE 8
#define SALT_SIZE (NONCE_SIZE - IV_SIZE)

/**
 * Pipeline parallelism we use for GCM en/decryption
 */
#define GCM_CRYPT_PARALLELISM 4
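/* AES-GCM as used for ESP (RFC 4106): the 12-byte GCM nonce consists of a
 * 4-byte salt taken from the keying material and an 8-byte per-packet IV;
 * the remaining 4 bytes of the counter block hold the block counter. */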
typedef struct private_aesni_gcm_t private_aesni_gcm_t;

/**
 * GCM en/decryption method type
 */
typedef void (*aesni_gcm_fn_t)(private_aesni_gcm_t*, size_t, u_char*, u_char*,
							   u_char*, size_t, u_char*, u_char*);

/**
 * Private data of an aesni_gcm_t object.
 */
struct private_aesni_gcm_t {

	/**
	 * Public aesni_gcm_t interface.
	 */
	aesni_gcm_t public;

	/**
	 * Encryption key schedule
	 */
	aesni_key_t *key;

	/**
	 * IV generator.
	 */
	iv_gen_t *iv_gen;

	/**
	 * Length of the integrity check value
	 */
	size_t icv_size;

	/**
	 * Length of the key in bytes
	 */
	size_t key_size;

	/**
	 * GCM encryption function
	 */
	aesni_gcm_fn_t encrypt;

	/**
	 * GCM decryption function
	 */
	aesni_gcm_fn_t decrypt;

	/**
	 * salt to add to nonce
	 */
	u_char salt[SALT_SIZE];

	/**
	 * GHASH subkey H, big-endian
	 */
	__m128i h;

	/**
	 * GHASH key H^2, big-endian
	 */
	__m128i hh;

	/**
	 * GHASH key H^3, big-endian
	 */
	__m128i hhh;

	/**
	 * GHASH key H^4, big-endian
	 */
	__m128i hhhh;
};
/**
 * Byte-swap a 128-bit integer
 */
static inline __m128i swap128(__m128i x)
{
	return _mm_shuffle_epi8(x,
			_mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
}
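/* GHASH interprets a block as a big-endian element of GF(2^128), while
 * PCLMULQDQ multiplies the little-endian 64-bit halves of its operands.
 * The multiplication helpers below therefore byte-swap the data operands on
 * entry and swap the product back on return; the GHASH key powers are stored
 * pre-swapped by set_key(), so they can be used directly. */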
/**
 * Multiply two blocks in GF128
 */
static __m128i mult_block(__m128i h, __m128i y)
{
	__m128i t1, t2, t3, t4, t5, t6;

	y = swap128(y);

	t1 = _mm_clmulepi64_si128(h, y, 0x00);
	t2 = _mm_clmulepi64_si128(h, y, 0x01);
	t3 = _mm_clmulepi64_si128(h, y, 0x10);
	t4 = _mm_clmulepi64_si128(h, y, 0x11);

	t2 = _mm_xor_si128(t2, t3);
	t3 = _mm_slli_si128(t2, 8);
	t2 = _mm_srli_si128(t2, 8);
	t1 = _mm_xor_si128(t1, t3);
	t4 = _mm_xor_si128(t4, t2);

	t5 = _mm_srli_epi32(t1, 31);
	t1 = _mm_slli_epi32(t1, 1);
	t6 = _mm_srli_epi32(t4, 31);
	t4 = _mm_slli_epi32(t4, 1);

	t3 = _mm_srli_si128(t5, 12);
	t6 = _mm_slli_si128(t6, 4);
	t5 = _mm_slli_si128(t5, 4);
	t1 = _mm_or_si128(t1, t5);
	t4 = _mm_or_si128(t4, t6);
	t4 = _mm_or_si128(t4, t3);

	t5 = _mm_slli_epi32(t1, 31);
	t6 = _mm_slli_epi32(t1, 30);
	t3 = _mm_slli_epi32(t1, 25);

	t5 = _mm_xor_si128(t5, t6);
	t5 = _mm_xor_si128(t5, t3);
	t6 = _mm_srli_si128(t5, 4);
	t4 = _mm_xor_si128(t4, t6);
	t5 = _mm_slli_si128(t5, 12);
	t1 = _mm_xor_si128(t1, t5);
	t4 = _mm_xor_si128(t4, t1);

	t5 = _mm_srli_epi32(t1, 1);
	t2 = _mm_srli_epi32(t1, 2);
	t3 = _mm_srli_epi32(t1, 7);
	t4 = _mm_xor_si128(t4, t2);
	t4 = _mm_xor_si128(t4, t3);
	t4 = _mm_xor_si128(t4, t5);

	return swap128(t4);
}
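/* mult4xor() folds four blocks into the GHASH state with a single reduction:
 * it computes d1*h1 ^ d2*h2 ^ d3*h3 ^ d4*h4 in GF(2^128), where the callers
 * pass h1..h4 = H^4..H and XOR the accumulator Y into the first data block
 * beforehand.  _mm_shuffle_epi32(x, 78) swaps the 64-bit halves of x, giving
 * the (lo ^ hi) operands for the Karatsuba-style middle products. */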
/**
 * Multiply four consecutive blocks by their respective GHASH key, XOR
 */
static inline __m128i mult4xor(__m128i h1, __m128i h2, __m128i h3, __m128i h4,
							   __m128i d1, __m128i d2, __m128i d3, __m128i d4)
{
	__m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;

	d1 = swap128(d1);
	d2 = swap128(d2);
	d3 = swap128(d3);
	d4 = swap128(d4);

	t0 = _mm_clmulepi64_si128(h1, d1, 0x00);
	t1 = _mm_clmulepi64_si128(h2, d2, 0x00);
	t2 = _mm_clmulepi64_si128(h3, d3, 0x00);
	t3 = _mm_clmulepi64_si128(h4, d4, 0x00);
	t8 = _mm_xor_si128(t0, t1);
	t8 = _mm_xor_si128(t8, t2);
	t8 = _mm_xor_si128(t8, t3);

	t4 = _mm_clmulepi64_si128(h1, d1, 0x11);
	t5 = _mm_clmulepi64_si128(h2, d2, 0x11);
	t6 = _mm_clmulepi64_si128(h3, d3, 0x11);
	t7 = _mm_clmulepi64_si128(h4, d4, 0x11);
	t9 = _mm_xor_si128(t4, t5);
	t9 = _mm_xor_si128(t9, t6);
	t9 = _mm_xor_si128(t9, t7);

	t0 = _mm_shuffle_epi32(h1, 78);
	t4 = _mm_shuffle_epi32(d1, 78);
	t0 = _mm_xor_si128(t0, h1);
	t4 = _mm_xor_si128(t4, d1);
	t1 = _mm_shuffle_epi32(h2, 78);
	t5 = _mm_shuffle_epi32(d2, 78);
	t1 = _mm_xor_si128(t1, h2);
	t5 = _mm_xor_si128(t5, d2);
	t2 = _mm_shuffle_epi32(h3, 78);
	t6 = _mm_shuffle_epi32(d3, 78);
	t2 = _mm_xor_si128(t2, h3);
	t6 = _mm_xor_si128(t6, d3);
	t3 = _mm_shuffle_epi32(h4, 78);
	t7 = _mm_shuffle_epi32(d4, 78);
	t3 = _mm_xor_si128(t3, h4);
	t7 = _mm_xor_si128(t7, d4);

	t0 = _mm_clmulepi64_si128(t0, t4, 0x00);
	t1 = _mm_clmulepi64_si128(t1, t5, 0x00);
	t2 = _mm_clmulepi64_si128(t2, t6, 0x00);
	t3 = _mm_clmulepi64_si128(t3, t7, 0x00);
	t0 = _mm_xor_si128(t0, t8);
	t0 = _mm_xor_si128(t0, t9);
	t0 = _mm_xor_si128(t1, t0);
	t0 = _mm_xor_si128(t2, t0);
	t0 = _mm_xor_si128(t3, t0);

	t4 = _mm_slli_si128(t0, 8);
	t0 = _mm_srli_si128(t0, 8);
	t3 = _mm_xor_si128(t4, t8);
	t6 = _mm_xor_si128(t0, t9);
	t7 = _mm_srli_epi32(t3, 31);
	t8 = _mm_srli_epi32(t6, 31);
	t3 = _mm_slli_epi32(t3, 1);
	t6 = _mm_slli_epi32(t6, 1);
	t9 = _mm_srli_si128(t7, 12);
	t8 = _mm_slli_si128(t8, 4);
	t7 = _mm_slli_si128(t7, 4);
	t3 = _mm_or_si128(t3, t7);
	t6 = _mm_or_si128(t6, t8);
	t6 = _mm_or_si128(t6, t9);
	t7 = _mm_slli_epi32(t3, 31);
	t8 = _mm_slli_epi32(t3, 30);
	t9 = _mm_slli_epi32(t3, 25);
	t7 = _mm_xor_si128(t7, t8);
	t7 = _mm_xor_si128(t7, t9);
	t8 = _mm_srli_si128(t7, 4);
	t7 = _mm_slli_si128(t7, 12);
	t3 = _mm_xor_si128(t3, t7);
	t2 = _mm_srli_epi32(t3, 1);
	t4 = _mm_srli_epi32(t3, 2);
	t5 = _mm_srli_epi32(t3, 7);
	t2 = _mm_xor_si128(t2, t4);
	t2 = _mm_xor_si128(t2, t5);
	t2 = _mm_xor_si128(t2, t8);
	t3 = _mm_xor_si128(t3, t2);
	t6 = _mm_xor_si128(t6, t3);

	return swap128(t6);
}
/**
 * GHASH on a single block
 */
static __m128i ghash(__m128i h, __m128i y, __m128i x)
{
	return mult_block(h, _mm_xor_si128(y, x));
}
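/* The ICV is built in three steps: icv_header() hashes the associated data,
 * the bulk routines hash the ciphertext blocks, and icv_tailer() hashes the
 * block holding the bit lengths of both.  icv_crypt() finally masks the
 * GHASH result with the keystream block E(K, J0) and trims it to icv_size. */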
/**
 * Start constructing the ICV for the associated data
 */
static __m128i icv_header(private_aesni_gcm_t *this, void *assoc, size_t alen)
{
	u_int blocks, pblocks, rem, i;
	__m128i h1, h2, h3, h4, d1, d2, d3, d4;
	__m128i y, last, *ab;

	h1 = this->hhhh;
	h2 = this->hhh;
	h3 = this->hh;
	h4 = this->h;

	y = _mm_setzero_si128();
	ab = assoc;
	blocks = alen / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = alen % AES_BLOCK_SIZE;
	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(ab + i + 0);
		d2 = _mm_loadu_si128(ab + i + 1);
		d3 = _mm_loadu_si128(ab + i + 2);
		d4 = _mm_loadu_si128(ab + i + 3);
		y = _mm_xor_si128(y, d1);
		y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4);
	}
	for (i = pblocks; i < blocks; i++)
	{
		y = ghash(this->h, y, _mm_loadu_si128(ab + i));
	}
	if (rem)
	{
		last = _mm_setzero_si128();
		memcpy(&last, ab + blocks, rem);

		y = ghash(this->h, y, last);
	}

	return y;
}
/**
 * Complete the ICV by hashing an assoc/data length block
 */
static __m128i icv_tailer(private_aesni_gcm_t *this, __m128i y,
						  size_t alen, size_t dlen)
{
	__m128i b;

	htoun64(&b, alen * 8);
	htoun64((u_char*)&b + sizeof(u_int64_t), dlen * 8);

	return ghash(this->h, y, b);
}
/**
 * En-/Decrypt the ICV, trim and store it
 */
static void icv_crypt(private_aesni_gcm_t *this, __m128i y, __m128i j,
					  u_char *icv)
{
	__m128i t, b, *ks;
	u_int round;

	ks = this->key->schedule;
	t = _mm_xor_si128(j, ks[0]);
	for (round = 1; round < this->key->rounds; round++)
	{
		t = _mm_aesenc_si128(t, ks[round]);
	}
	t = _mm_aesenclast_si128(t, ks[this->key->rounds]);

	t = _mm_xor_si128(y, t);

	_mm_storeu_si128(&b, t);
	memcpy(icv, &b, this->icv_size);
}
/**
 * Do big-endian increment on x
 */
static inline __m128i increment_be(__m128i x)
{
	x = swap128(x);
	x = _mm_add_epi64(x, _mm_set_epi32(0, 0, 0, 1));
	x = swap128(x);

	return x;
}
/**
 * Generate the block J0
 */
static inline __m128i create_j(private_aesni_gcm_t *this, u_char *iv)
{
	u_char j[AES_BLOCK_SIZE];

	memcpy(j, this->salt, SALT_SIZE);
	memcpy(j + SALT_SIZE, iv, IV_SIZE);
	htoun32(j + SALT_SIZE + IV_SIZE, 1);

	return _mm_loadu_si128((__m128i*)j);
}
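/* With a 96-bit nonce, GCM defines J0 = nonce || 0x00000001: the first
 * counter block used for payload encryption is the incremented J0, while
 * E(K, J0) itself encrypts the ICV.  create_j() assembles the nonce from
 * the key salt and the per-packet IV as specified for AES-GCM ESP in
 * RFC 4106. */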
/**
 * Encrypt a remaining incomplete block, return updated Y
 */
static __m128i encrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
							   void *in, void *out, __m128i cb, __m128i y)
{
	__m128i t, b, *ks;
	u_int round;

	memset(&b, 0, sizeof(b));
	memcpy(&b, in, rem);

	ks = this->key->schedule;
	t = _mm_xor_si128(cb, ks[0]);
	for (round = 1; round < this->key->rounds; round++)
	{
		t = _mm_aesenc_si128(t, ks[round]);
	}
	t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
	b = _mm_xor_si128(t, b);

	memcpy(out, &b, rem);

	memset((u_char*)&b + rem, 0, AES_BLOCK_SIZE - rem);
	return ghash(this->h, y, b);
}
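/* Note the ordering difference between the two tail handlers: encryption
 * zeroes the keystream bytes beyond the message tail before hashing the
 * padded ciphertext block, while decryption hashes the zero-padded
 * ciphertext before decrypting it, as GHASH always covers the ciphertext. */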
/**
 * Decrypt a remaining incomplete block, return updated Y
 */
static __m128i decrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
							   void *in, void *out, __m128i cb, __m128i y)
{
	__m128i t, b, *ks;
	u_int round;

	memset(&b, 0, sizeof(b));
	memcpy(&b, in, rem);

	y = ghash(this->h, y, b);

	ks = this->key->schedule;
	t = _mm_xor_si128(cb, ks[0]);
	for (round = 1; round < this->key->rounds; round++)
	{
		t = _mm_aesenc_si128(t, ks[round]);
	}
	t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
	b = _mm_xor_si128(t, b);

	memcpy(out, &b, rem);

	return y;
}
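/* The six bulk workers below differ only in the number of unrolled AES
 * rounds (10/12/14 for AES-128/192/256) and are otherwise identical: each
 * encrypts four counter blocks in an interleaved fashion to keep the AES-NI
 * pipeline busy and folds four ciphertext blocks per GHASH reduction using
 * the precomputed key powers H^4..H. */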
/**
 * AES-128 GCM encryption/ICV generation
 */
static void encrypt_gcm128(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);

		t1 = _mm_aesenclast_si128(t1, ks[10]);
		t2 = _mm_aesenclast_si128(t2, ks[10]);
		t3 = _mm_aesenclast_si128(t3, ks[10]);
		t4 = _mm_aesenclast_si128(t4, ks[10]);

		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		y = _mm_xor_si128(y, t1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenclast_si128(t1, ks[10]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		y = ghash(this->h, y, t1);

		cb = increment_be(cb);
	}

	if (rem)
	{
		y = encrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}
/**
 * AES-128 GCM decryption/ICV generation
 */
static void decrypt_gcm128(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		y = _mm_xor_si128(y, d1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);

		t1 = _mm_aesenclast_si128(t1, ks[10]);
		t2 = _mm_aesenclast_si128(t2, ks[10]);
		t3 = _mm_aesenclast_si128(t3, ks[10]);
		t4 = _mm_aesenclast_si128(t4, ks[10]);

		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		y = ghash(this->h, y, d1);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenclast_si128(t1, ks[10]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		cb = increment_be(cb);
	}

	if (rem)
	{
		y = decrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}
/**
 * AES-192 GCM encryption/ICV generation
 */
static void encrypt_gcm192(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);

		t1 = _mm_aesenclast_si128(t1, ks[12]);
		t2 = _mm_aesenclast_si128(t2, ks[12]);
		t3 = _mm_aesenclast_si128(t3, ks[12]);
		t4 = _mm_aesenclast_si128(t4, ks[12]);

		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		y = _mm_xor_si128(y, t1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenclast_si128(t1, ks[12]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		y = ghash(this->h, y, t1);

		cb = increment_be(cb);
	}

	if (rem)
	{
		y = encrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}
/**
 * AES-192 GCM decryption/ICV generation
 */
static void decrypt_gcm192(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		y = _mm_xor_si128(y, d1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);

		t1 = _mm_aesenclast_si128(t1, ks[12]);
		t2 = _mm_aesenclast_si128(t2, ks[12]);
		t3 = _mm_aesenclast_si128(t3, ks[12]);
		t4 = _mm_aesenclast_si128(t4, ks[12]);

		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		y = ghash(this->h, y, d1);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenclast_si128(t1, ks[12]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		cb = increment_be(cb);
	}

	if (rem)
	{
		y = decrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}
/**
 * AES-256 GCM encryption/ICV generation
 */
static void encrypt_gcm256(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t2 = _mm_aesenc_si128(t2, ks[12]);
		t3 = _mm_aesenc_si128(t3, ks[12]);
		t4 = _mm_aesenc_si128(t4, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t2 = _mm_aesenc_si128(t2, ks[13]);
		t3 = _mm_aesenc_si128(t3, ks[13]);
		t4 = _mm_aesenc_si128(t4, ks[13]);

		t1 = _mm_aesenclast_si128(t1, ks[14]);
		t2 = _mm_aesenclast_si128(t2, ks[14]);
		t3 = _mm_aesenclast_si128(t3, ks[14]);
		t4 = _mm_aesenclast_si128(t4, ks[14]);

		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		y = _mm_xor_si128(y, t1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t1 = _mm_aesenclast_si128(t1, ks[14]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		y = ghash(this->h, y, t1);

		cb = increment_be(cb);
	}

	if (rem)
	{
		y = encrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}
/**
 * AES-256 GCM decryption/ICV generation
 */
static void decrypt_gcm256(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		y = _mm_xor_si128(y, d1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t2 = _mm_aesenc_si128(t2, ks[12]);
		t3 = _mm_aesenc_si128(t3, ks[12]);
		t4 = _mm_aesenc_si128(t4, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t2 = _mm_aesenc_si128(t2, ks[13]);
		t3 = _mm_aesenc_si128(t3, ks[13]);
		t4 = _mm_aesenc_si128(t4, ks[13]);

		t1 = _mm_aesenclast_si128(t1, ks[14]);
		t2 = _mm_aesenclast_si128(t2, ks[14]);
		t3 = _mm_aesenclast_si128(t3, ks[14]);
		t4 = _mm_aesenclast_si128(t4, ks[14]);

		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		y = ghash(this->h, y, d1);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t1 = _mm_aesenclast_si128(t1, ks[14]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		cb = increment_be(cb);
	}

	if (rem)
	{
		y = decrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}
METHOD(aead_t, encrypt, bool,
	private_aesni_gcm_t *this, chunk_t plain, chunk_t assoc, chunk_t iv,
	chunk_t *encr)
{
	u_char *out;

	if (!this->key || iv.len != IV_SIZE)
	{
		return FALSE;
	}
	out = plain.ptr;
	if (encr)
	{
		*encr = chunk_alloc(plain.len + this->icv_size);
		out = encr->ptr;
	}
	this->encrypt(this, plain.len, plain.ptr, out, iv.ptr,
				  assoc.len, assoc.ptr, out + plain.len);
	return TRUE;
}
METHOD(aead_t, decrypt, bool,
	private_aesni_gcm_t *this, chunk_t encr, chunk_t assoc, chunk_t iv,
	chunk_t *plain)
{
	u_char *out, icv[this->icv_size];

	if (!this->key || iv.len != IV_SIZE || encr.len < this->icv_size)
	{
		return FALSE;
	}
	encr.len -= this->icv_size;
	out = encr.ptr;
	if (plain)
	{
		*plain = chunk_alloc(encr.len);
		out = plain->ptr;
	}
	this->decrypt(this, encr.len, encr.ptr, out, iv.ptr,
				  assoc.len, assoc.ptr, icv);
	return memeq_const(icv, encr.ptr + encr.len, this->icv_size);
}
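/* memeq_const() compares the computed ICV against the received one in
 * constant time, so a failed verification does not leak through timing how
 * many tag bytes matched. */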
METHOD(aead_t, get_block_size, size_t,
	private_aesni_gcm_t *this)
{
	return 1;
}
METHOD(aead_t, get_icv_size, size_t,
	private_aesni_gcm_t *this)
{
	return this->icv_size;
}
METHOD(aead_t, get_iv_size, size_t,
	private_aesni_gcm_t *this)
{
	return IV_SIZE;
}
METHOD(aead_t, get_iv_gen, iv_gen_t*,
	private_aesni_gcm_t *this)
{
	return this->iv_gen;
}
METHOD(aead_t, get_key_size, size_t,
	private_aesni_gcm_t *this)
{
	return this->key_size + SALT_SIZE;
}
METHOD(aead_t, set_key, bool,
	private_aesni_gcm_t *this, chunk_t key)
{
	u_int round;
	__m128i h, *ks;

	if (key.len != this->key_size + SALT_SIZE)
	{
		return FALSE;
	}

	memcpy(this->salt, key.ptr + key.len - SALT_SIZE, SALT_SIZE);
	key.len -= SALT_SIZE;

	DESTROY_IF(this->key);
	this->key = aesni_key_create(TRUE, key);

	ks = this->key->schedule;
	h = _mm_xor_si128(_mm_setzero_si128(), ks[0]);
	for (round = 1; round < this->key->rounds; round++)
	{
		h = _mm_aesenc_si128(h, ks[round]);
	}
	h = _mm_aesenclast_si128(h, ks[this->key->rounds]);

	this->h = h;
	h = swap128(h);
	this->hh = mult_block(h, this->h);
	this->hhh = mult_block(h, this->hh);
	this->hhhh = mult_block(h, this->hhh);
	this->h = swap128(this->h);
	this->hh = swap128(this->hh);
	this->hhh = swap128(this->hhh);
	this->hhhh = swap128(this->hhhh);

	return TRUE;
}
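/* set_key() derives the GHASH key as H = E(K, 0^128) by running the zero
 * block through the fresh key schedule, multiplies it up to H^2..H^4 for
 * the four-way GHASH path, and stores all powers byte-swapped so the
 * PCLMULQDQ-based multiplication routines can use them directly. */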
METHOD(aead_t, destroy, void,
	private_aesni_gcm_t *this)
{
	DESTROY_IF(this->key);
	memwipe(&this->h, sizeof(this->h));
	memwipe(&this->hh, sizeof(this->hh));
	memwipe(&this->hhh, sizeof(this->hhh));
	memwipe(&this->hhhh, sizeof(this->hhhh));
	this->iv_gen->destroy(this->iv_gen);
	free(this);
}
/**
 * See header
 */
aesni_gcm_t *aesni_gcm_create(encryption_algorithm_t algo,
							  size_t key_size, size_t salt_size)
{
	private_aesni_gcm_t *this;
	size_t icv_size;

	switch (key_size)
	{
		case 0:
			key_size = 16;
			break;
		case 16:
		case 24:
		case 32:
			break;
		default:
			return NULL;
	}
	if (salt_size && salt_size != SALT_SIZE)
	{
		/* currently not supported */
		return NULL;
	}
	switch (algo)
	{
		case ENCR_AES_GCM_ICV8:
			algo = ENCR_AES_CBC;
			icv_size = 8;
			break;
		case ENCR_AES_GCM_ICV12:
			algo = ENCR_AES_CBC;
			icv_size = 12;
			break;
		case ENCR_AES_GCM_ICV16:
			algo = ENCR_AES_CBC;
			icv_size = 16;
			break;
		default:
			return NULL;
	}

	INIT_ALIGN(this, sizeof(__m128i),
		.public = {
			.aead = {
				.encrypt = _encrypt,
				.decrypt = _decrypt,
				.get_block_size = _get_block_size,
				.get_icv_size = _get_icv_size,
				.get_iv_size = _get_iv_size,
				.get_iv_gen = _get_iv_gen,
				.get_key_size = _get_key_size,
				.set_key = _set_key,
				.destroy = _destroy,
			},
		},
		.key_size = key_size,
		.iv_gen = iv_gen_seq_create(),
		.icv_size = icv_size,
	);

	switch (key_size)
	{
		case 16:
			this->encrypt = encrypt_gcm128;
			this->decrypt = decrypt_gcm128;
			break;
		case 24:
			this->encrypt = encrypt_gcm192;
			this->decrypt = decrypt_gcm192;
			break;
		case 32:
			this->encrypt = encrypt_gcm256;
			this->decrypt = decrypt_gcm256;
			break;
	}

	return &this->public;
}
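/* A minimal usage sketch, assuming the aesni_gcm_t declared in aesni_gcm.h
 * wraps the aead_t interface in an "aead" member and that key holds the AES
 * key followed by the 4-byte salt:
 *
 *	aesni_gcm_t *gcm;
 *	chunk_t key, plain, assoc, iv, encr;
 *
 *	gcm = aesni_gcm_create(ENCR_AES_GCM_ICV16, 16, SALT_SIZE);
 *	if (gcm && gcm->aead.set_key(&gcm->aead, key) &&
 *		gcm->aead.encrypt(&gcm->aead, plain, assoc, iv, &encr))
 *	{
 *		... encr now holds ciphertext || 16-byte ICV ...
 *	}
 *	gcm->aead.destroy(&gcm->aead);
 */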