/*
 * Copyright (C) 2015 Martin Willi
 * Copyright (C) 2015 revosec AG
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version. See <http://www.fsf.org/copyleft/gpl.txt>.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * for more details.
 */

#include "aesni_ctr.h"
#include "aesni_key.h"

#include <tmmintrin.h>

/**
 * Pipeline parallelism we use for CTR en/decryption
 */
#define CTR_CRYPT_PARALLELISM 4
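/* AESENC has a latency of several cycles but is pipelined, so interleaving
 * four independent counter blocks keeps the AES unit busy instead of
 * stalling on each round result. */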

typedef struct private_aesni_ctr_t private_aesni_ctr_t;

/**
 * CTR en/decryption method type
 */
typedef void (*aesni_ctr_fn_t)(private_aesni_ctr_t*, size_t, u_char*, u_char*);

/**
 * Private data of an aesni_ctr_t object.
 */
struct private_aesni_ctr_t {

	/**
	 * Public aesni_ctr_t interface.
	 */
	aesni_ctr_t public;

	/**
	 * Key size
	 */
	u_int key_size;

	/**
	 * Key schedule
	 */
	aesni_key_t *key;

	/**
	 * Encryption method
	 */
	aesni_ctr_fn_t crypt;

	/**
	 * Counter state
	 */
	struct {
		char nonce[4];
		char iv[8];
		u_int32_t counter;
	} __attribute__((packed, aligned(sizeof(__m128i)))) state;
};
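/* The counter state forms the 16-byte initial counter block of RFC 3686
 * (nonce | IV | counter); being packed and 16-byte aligned, it can be
 * loaded straight into an XMM register with _mm_load_si128(). */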

/**
 * Do big-endian increment on x
 */
static inline __m128i increment_be(__m128i x)
{
	__m128i swap;

	swap = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

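	/* The counter block is big-endian, but _mm_add_epi64() adds in
	 * little-endian lane order; byte-reverse, add one to the low 64-bit
	 * lane, and reverse back.  A carry out of the low 64 bits is not
	 * propagated, which is fine here as only the trailing 32-bit counter
	 * is expected to change. */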
	x = _mm_shuffle_epi8(x, swap);
	x = _mm_add_epi64(x, _mm_set_epi32(0, 0, 0, 1));
	x = _mm_shuffle_epi8(x, swap);

	return x;
}

/**
 * AES-128 CTR encryption
 */
static void encrypt_ctr128(private_aesni_ctr_t *this,
						   size_t len, u_char *in, u_char *out)
{
	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
	__m128i t1, t2, t3, t4;
	__m128i d1, d2, d3, d4;
	__m128i state, b, *bi, *bo;
	u_int i, blocks, pblocks, rem;

	state = _mm_load_si128((__m128i*)&this->state);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % CTR_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	k0 = this->key->schedule[0];
	k1 = this->key->schedule[1];
	k2 = this->key->schedule[2];
	k3 = this->key->schedule[3];
	k4 = this->key->schedule[4];
	k5 = this->key->schedule[5];
	k6 = this->key->schedule[6];
	k7 = this->key->schedule[7];
	k8 = this->key->schedule[8];
	k9 = this->key->schedule[9];
	k10 = this->key->schedule[10];

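	/* 4-way interleaved encryption: generate four keystream blocks at once
	 * by applying each round key to four independent counter blocks, then
	 * XOR the keystream into the data. */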
	for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(state, k0);
		state = increment_be(state);
		t2 = _mm_xor_si128(state, k0);
		state = increment_be(state);
		t3 = _mm_xor_si128(state, k0);
		state = increment_be(state);
		t4 = _mm_xor_si128(state, k0);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, k1);
		t2 = _mm_aesenc_si128(t2, k1);
		t3 = _mm_aesenc_si128(t3, k1);
		t4 = _mm_aesenc_si128(t4, k1);
		t1 = _mm_aesenc_si128(t1, k2);
		t2 = _mm_aesenc_si128(t2, k2);
		t3 = _mm_aesenc_si128(t3, k2);
		t4 = _mm_aesenc_si128(t4, k2);
		t1 = _mm_aesenc_si128(t1, k3);
		t2 = _mm_aesenc_si128(t2, k3);
		t3 = _mm_aesenc_si128(t3, k3);
		t4 = _mm_aesenc_si128(t4, k3);
		t1 = _mm_aesenc_si128(t1, k4);
		t2 = _mm_aesenc_si128(t2, k4);
		t3 = _mm_aesenc_si128(t3, k4);
		t4 = _mm_aesenc_si128(t4, k4);
		t1 = _mm_aesenc_si128(t1, k5);
		t2 = _mm_aesenc_si128(t2, k5);
		t3 = _mm_aesenc_si128(t3, k5);
		t4 = _mm_aesenc_si128(t4, k5);
		t1 = _mm_aesenc_si128(t1, k6);
		t2 = _mm_aesenc_si128(t2, k6);
		t3 = _mm_aesenc_si128(t3, k6);
		t4 = _mm_aesenc_si128(t4, k6);
		t1 = _mm_aesenc_si128(t1, k7);
		t2 = _mm_aesenc_si128(t2, k7);
		t3 = _mm_aesenc_si128(t3, k7);
		t4 = _mm_aesenc_si128(t4, k7);
		t1 = _mm_aesenc_si128(t1, k8);
		t2 = _mm_aesenc_si128(t2, k8);
		t3 = _mm_aesenc_si128(t3, k8);
		t4 = _mm_aesenc_si128(t4, k8);
		t1 = _mm_aesenc_si128(t1, k9);
		t2 = _mm_aesenc_si128(t2, k9);
		t3 = _mm_aesenc_si128(t3, k9);
		t4 = _mm_aesenc_si128(t4, k9);

		t1 = _mm_aesenclast_si128(t1, k10);
		t2 = _mm_aesenclast_si128(t2, k10);
		t3 = _mm_aesenclast_si128(t3, k10);
		t4 = _mm_aesenclast_si128(t4, k10);
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);
		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

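	/* encrypt remaining whole blocks one at a time */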
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(state, k0);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, k1);
		t1 = _mm_aesenc_si128(t1, k2);
		t1 = _mm_aesenc_si128(t1, k3);
		t1 = _mm_aesenc_si128(t1, k4);
		t1 = _mm_aesenc_si128(t1, k5);
		t1 = _mm_aesenc_si128(t1, k6);
		t1 = _mm_aesenc_si128(t1, k7);
		t1 = _mm_aesenc_si128(t1, k8);
		t1 = _mm_aesenc_si128(t1, k9);

		t1 = _mm_aesenclast_si128(t1, k10);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);
	}

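	/* For a trailing partial block, generate one more keystream block in a
	 * zero-padded stack buffer and copy out only the rem valid bytes. */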
	if (rem)
	{
		memset(&b, 0, sizeof(b));
		memcpy(&b, bi + blocks, rem);

		d1 = _mm_loadu_si128(&b);
		t1 = _mm_xor_si128(state, k0);

		t1 = _mm_aesenc_si128(t1, k1);
		t1 = _mm_aesenc_si128(t1, k2);
		t1 = _mm_aesenc_si128(t1, k3);
		t1 = _mm_aesenc_si128(t1, k4);
		t1 = _mm_aesenc_si128(t1, k5);
		t1 = _mm_aesenc_si128(t1, k6);
		t1 = _mm_aesenc_si128(t1, k7);
		t1 = _mm_aesenc_si128(t1, k8);
		t1 = _mm_aesenc_si128(t1, k9);

		t1 = _mm_aesenclast_si128(t1, k10);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(&b, t1);

		memcpy(bo + blocks, &b, rem);
	}
}

/**
 * AES-192 CTR encryption
 */
static void encrypt_ctr192(private_aesni_ctr_t *this,
						   size_t len, u_char *in, u_char *out)
{
	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
	__m128i t1, t2, t3, t4;
	__m128i d1, d2, d3, d4;
	__m128i state, b, *bi, *bo;
	u_int i, blocks, pblocks, rem;

	state = _mm_load_si128((__m128i*)&this->state);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % CTR_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	k0 = this->key->schedule[0];
	k1 = this->key->schedule[1];
	k2 = this->key->schedule[2];
	k3 = this->key->schedule[3];
	k4 = this->key->schedule[4];
	k5 = this->key->schedule[5];
	k6 = this->key->schedule[6];
	k7 = this->key->schedule[7];
	k8 = this->key->schedule[8];
	k9 = this->key->schedule[9];
	k10 = this->key->schedule[10];
	k11 = this->key->schedule[11];
	k12 = this->key->schedule[12];

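	/* Same 4-way interleaving as in encrypt_ctr128(), but with the
	 * twelve rounds of AES-192. */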
	for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(state, k0);
		state = increment_be(state);
		t2 = _mm_xor_si128(state, k0);
		state = increment_be(state);
		t3 = _mm_xor_si128(state, k0);
		state = increment_be(state);
		t4 = _mm_xor_si128(state, k0);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, k1);
		t2 = _mm_aesenc_si128(t2, k1);
		t3 = _mm_aesenc_si128(t3, k1);
		t4 = _mm_aesenc_si128(t4, k1);
		t1 = _mm_aesenc_si128(t1, k2);
		t2 = _mm_aesenc_si128(t2, k2);
		t3 = _mm_aesenc_si128(t3, k2);
		t4 = _mm_aesenc_si128(t4, k2);
		t1 = _mm_aesenc_si128(t1, k3);
		t2 = _mm_aesenc_si128(t2, k3);
		t3 = _mm_aesenc_si128(t3, k3);
		t4 = _mm_aesenc_si128(t4, k3);
		t1 = _mm_aesenc_si128(t1, k4);
		t2 = _mm_aesenc_si128(t2, k4);
		t3 = _mm_aesenc_si128(t3, k4);
		t4 = _mm_aesenc_si128(t4, k4);
		t1 = _mm_aesenc_si128(t1, k5);
		t2 = _mm_aesenc_si128(t2, k5);
		t3 = _mm_aesenc_si128(t3, k5);
		t4 = _mm_aesenc_si128(t4, k5);
		t1 = _mm_aesenc_si128(t1, k6);
		t2 = _mm_aesenc_si128(t2, k6);
		t3 = _mm_aesenc_si128(t3, k6);
		t4 = _mm_aesenc_si128(t4, k6);
		t1 = _mm_aesenc_si128(t1, k7);
		t2 = _mm_aesenc_si128(t2, k7);
		t3 = _mm_aesenc_si128(t3, k7);
		t4 = _mm_aesenc_si128(t4, k7);
		t1 = _mm_aesenc_si128(t1, k8);
		t2 = _mm_aesenc_si128(t2, k8);
		t3 = _mm_aesenc_si128(t3, k8);
		t4 = _mm_aesenc_si128(t4, k8);
		t1 = _mm_aesenc_si128(t1, k9);
		t2 = _mm_aesenc_si128(t2, k9);
		t3 = _mm_aesenc_si128(t3, k9);
		t4 = _mm_aesenc_si128(t4, k9);
		t1 = _mm_aesenc_si128(t1, k10);
		t2 = _mm_aesenc_si128(t2, k10);
		t3 = _mm_aesenc_si128(t3, k10);
		t4 = _mm_aesenc_si128(t4, k10);
		t1 = _mm_aesenc_si128(t1, k11);
		t2 = _mm_aesenc_si128(t2, k11);
		t3 = _mm_aesenc_si128(t3, k11);
		t4 = _mm_aesenc_si128(t4, k11);

		t1 = _mm_aesenclast_si128(t1, k12);
		t2 = _mm_aesenclast_si128(t2, k12);
		t3 = _mm_aesenclast_si128(t3, k12);
		t4 = _mm_aesenclast_si128(t4, k12);
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);
		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(state, k0);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, k1);
		t1 = _mm_aesenc_si128(t1, k2);
		t1 = _mm_aesenc_si128(t1, k3);
		t1 = _mm_aesenc_si128(t1, k4);
		t1 = _mm_aesenc_si128(t1, k5);
		t1 = _mm_aesenc_si128(t1, k6);
		t1 = _mm_aesenc_si128(t1, k7);
		t1 = _mm_aesenc_si128(t1, k8);
		t1 = _mm_aesenc_si128(t1, k9);
		t1 = _mm_aesenc_si128(t1, k10);
		t1 = _mm_aesenc_si128(t1, k11);

		t1 = _mm_aesenclast_si128(t1, k12);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);
	}

	if (rem)
	{
		memset(&b, 0, sizeof(b));
		memcpy(&b, bi + blocks, rem);

		d1 = _mm_loadu_si128(&b);
		t1 = _mm_xor_si128(state, k0);

		t1 = _mm_aesenc_si128(t1, k1);
		t1 = _mm_aesenc_si128(t1, k2);
		t1 = _mm_aesenc_si128(t1, k3);
		t1 = _mm_aesenc_si128(t1, k4);
		t1 = _mm_aesenc_si128(t1, k5);
		t1 = _mm_aesenc_si128(t1, k6);
		t1 = _mm_aesenc_si128(t1, k7);
		t1 = _mm_aesenc_si128(t1, k8);
		t1 = _mm_aesenc_si128(t1, k9);
		t1 = _mm_aesenc_si128(t1, k10);
		t1 = _mm_aesenc_si128(t1, k11);

		t1 = _mm_aesenclast_si128(t1, k12);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(&b, t1);

		memcpy(bo + blocks, &b, rem);
	}
}

/**
 * AES-256 CTR encryption
 */
static void encrypt_ctr256(private_aesni_ctr_t *this,
						   size_t len, u_char *in, u_char *out)
{
	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
	__m128i t1, t2, t3, t4;
	__m128i d1, d2, d3, d4;
	__m128i state, b, *bi, *bo;
	u_int i, blocks, pblocks, rem;

	state = _mm_load_si128((__m128i*)&this->state);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % CTR_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	k0 = this->key->schedule[0];
	k1 = this->key->schedule[1];
	k2 = this->key->schedule[2];
	k3 = this->key->schedule[3];
	k4 = this->key->schedule[4];
	k5 = this->key->schedule[5];
	k6 = this->key->schedule[6];
	k7 = this->key->schedule[7];
	k8 = this->key->schedule[8];
	k9 = this->key->schedule[9];
	k10 = this->key->schedule[10];
	k11 = this->key->schedule[11];
	k12 = this->key->schedule[12];
	k13 = this->key->schedule[13];
	k14 = this->key->schedule[14];

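	/* Same 4-way interleaving as in encrypt_ctr128(), but with the
	 * fourteen rounds of AES-256. */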
	for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(state, k0);
		state = increment_be(state);
		t2 = _mm_xor_si128(state, k0);
		state = increment_be(state);
		t3 = _mm_xor_si128(state, k0);
		state = increment_be(state);
		t4 = _mm_xor_si128(state, k0);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, k1);
		t2 = _mm_aesenc_si128(t2, k1);
		t3 = _mm_aesenc_si128(t3, k1);
		t4 = _mm_aesenc_si128(t4, k1);
		t1 = _mm_aesenc_si128(t1, k2);
		t2 = _mm_aesenc_si128(t2, k2);
		t3 = _mm_aesenc_si128(t3, k2);
		t4 = _mm_aesenc_si128(t4, k2);
		t1 = _mm_aesenc_si128(t1, k3);
		t2 = _mm_aesenc_si128(t2, k3);
		t3 = _mm_aesenc_si128(t3, k3);
		t4 = _mm_aesenc_si128(t4, k3);
		t1 = _mm_aesenc_si128(t1, k4);
		t2 = _mm_aesenc_si128(t2, k4);
		t3 = _mm_aesenc_si128(t3, k4);
		t4 = _mm_aesenc_si128(t4, k4);
		t1 = _mm_aesenc_si128(t1, k5);
		t2 = _mm_aesenc_si128(t2, k5);
		t3 = _mm_aesenc_si128(t3, k5);
		t4 = _mm_aesenc_si128(t4, k5);
		t1 = _mm_aesenc_si128(t1, k6);
		t2 = _mm_aesenc_si128(t2, k6);
		t3 = _mm_aesenc_si128(t3, k6);
		t4 = _mm_aesenc_si128(t4, k6);
		t1 = _mm_aesenc_si128(t1, k7);
		t2 = _mm_aesenc_si128(t2, k7);
		t3 = _mm_aesenc_si128(t3, k7);
		t4 = _mm_aesenc_si128(t4, k7);
		t1 = _mm_aesenc_si128(t1, k8);
		t2 = _mm_aesenc_si128(t2, k8);
		t3 = _mm_aesenc_si128(t3, k8);
		t4 = _mm_aesenc_si128(t4, k8);
		t1 = _mm_aesenc_si128(t1, k9);
		t2 = _mm_aesenc_si128(t2, k9);
		t3 = _mm_aesenc_si128(t3, k9);
		t4 = _mm_aesenc_si128(t4, k9);
		t1 = _mm_aesenc_si128(t1, k10);
		t2 = _mm_aesenc_si128(t2, k10);
		t3 = _mm_aesenc_si128(t3, k10);
		t4 = _mm_aesenc_si128(t4, k10);
		t1 = _mm_aesenc_si128(t1, k11);
		t2 = _mm_aesenc_si128(t2, k11);
		t3 = _mm_aesenc_si128(t3, k11);
		t4 = _mm_aesenc_si128(t4, k11);
		t1 = _mm_aesenc_si128(t1, k12);
		t2 = _mm_aesenc_si128(t2, k12);
		t3 = _mm_aesenc_si128(t3, k12);
		t4 = _mm_aesenc_si128(t4, k12);
		t1 = _mm_aesenc_si128(t1, k13);
		t2 = _mm_aesenc_si128(t2, k13);
		t3 = _mm_aesenc_si128(t3, k13);
		t4 = _mm_aesenc_si128(t4, k13);

		t1 = _mm_aesenclast_si128(t1, k14);
		t2 = _mm_aesenclast_si128(t2, k14);
		t3 = _mm_aesenclast_si128(t3, k14);
		t4 = _mm_aesenclast_si128(t4, k14);
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);
		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(state, k0);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, k1);
		t1 = _mm_aesenc_si128(t1, k2);
		t1 = _mm_aesenc_si128(t1, k3);
		t1 = _mm_aesenc_si128(t1, k4);
		t1 = _mm_aesenc_si128(t1, k5);
		t1 = _mm_aesenc_si128(t1, k6);
		t1 = _mm_aesenc_si128(t1, k7);
		t1 = _mm_aesenc_si128(t1, k8);
		t1 = _mm_aesenc_si128(t1, k9);
		t1 = _mm_aesenc_si128(t1, k10);
		t1 = _mm_aesenc_si128(t1, k11);
		t1 = _mm_aesenc_si128(t1, k12);
		t1 = _mm_aesenc_si128(t1, k13);

		t1 = _mm_aesenclast_si128(t1, k14);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);
	}

	if (rem)
	{
		memset(&b, 0, sizeof(b));
		memcpy(&b, bi + blocks, rem);

		d1 = _mm_loadu_si128(&b);
		t1 = _mm_xor_si128(state, k0);

		t1 = _mm_aesenc_si128(t1, k1);
		t1 = _mm_aesenc_si128(t1, k2);
		t1 = _mm_aesenc_si128(t1, k3);
		t1 = _mm_aesenc_si128(t1, k4);
		t1 = _mm_aesenc_si128(t1, k5);
		t1 = _mm_aesenc_si128(t1, k6);
		t1 = _mm_aesenc_si128(t1, k7);
		t1 = _mm_aesenc_si128(t1, k8);
		t1 = _mm_aesenc_si128(t1, k9);
		t1 = _mm_aesenc_si128(t1, k10);
		t1 = _mm_aesenc_si128(t1, k11);
		t1 = _mm_aesenc_si128(t1, k12);
		t1 = _mm_aesenc_si128(t1, k13);

		t1 = _mm_aesenclast_si128(t1, k14);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(&b, t1);

		memcpy(bo + blocks, &b, rem);
	}
}

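/**
 * Set up the counter block for a fresh IV; per RFC 3686, the block counter
 * starts at one.  CTR encryption and decryption are the same keystream
 * XOR operation.
 */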
METHOD(crypter_t, crypt, bool,
	private_aesni_ctr_t *this, chunk_t in, chunk_t iv, chunk_t *out)
{
	u_char *buf;

	if (!this->key || iv.len != sizeof(this->state.iv))
	{
		return FALSE;
	}
	memcpy(this->state.iv, iv.ptr, sizeof(this->state.iv));
	this->state.counter = htonl(1);

	buf = in.ptr;
	if (out)
	{
		*out = chunk_alloc(in.len);
		buf = out->ptr;
	}
	this->crypt(this, in.len, in.ptr, buf);
	return TRUE;
}

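/**
 * CTR mode turns the block cipher into a stream cipher, so callers need no
 * block padding; hence a block size of one.
 */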
METHOD(crypter_t, get_block_size, size_t,
	private_aesni_ctr_t *this)
{
	return 1;
}

METHOD(crypter_t, get_iv_size, size_t,
	private_aesni_ctr_t *this)
{
	return sizeof(this->state.iv);
}

METHOD(crypter_t, get_key_size, size_t,
	private_aesni_ctr_t *this)
{
	return this->key_size + sizeof(this->state.nonce);
}

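/**
 * As in RFC 3686, the last four bytes of the keying material are used as
 * the nonce; only the remainder is the actual AES key.
 */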
METHOD(crypter_t, set_key, bool,
	private_aesni_ctr_t *this, chunk_t key)
{
	if (key.len != get_key_size(this))
	{
		return FALSE;
	}

	memcpy(this->state.nonce, key.ptr + key.len - sizeof(this->state.nonce),
		   sizeof(this->state.nonce));
	key.len -= sizeof(this->state.nonce);

	DESTROY_IF(this->key);
	this->key = aesni_key_create(TRUE, key);

	return this->key;
}

METHOD(crypter_t, destroy, void,
	private_aesni_ctr_t *this)
{
	DESTROY_IF(this->key);
	free(this);
}

/**
 * See header
 */
aesni_ctr_t *aesni_ctr_create(encryption_algorithm_t algo, size_t key_size)
{
	private_aesni_ctr_t *this;

	if (algo != ENCR_AES_CTR)
	{
		return NULL;
	}
	switch (key_size)
	{
		case 0:
			key_size = 16;
			break;
		case 16:
		case 24:
		case 32:
			break;
		default:
			return NULL;
	}

	INIT(this,
		.public = {
			.crypter = {
				.encrypt = _crypt,
				.decrypt = _crypt,
				.get_block_size = _get_block_size,
				.get_iv_size = _get_iv_size,
				.get_key_size = _get_key_size,
				.set_key = _set_key,
				.destroy = _destroy,
			},
		},
		.key_size = key_size,
	);

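	/* select the round-count specific routine for this key size */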
	switch (key_size)
	{
		case 16:
			this->crypt = encrypt_ctr128;
			break;
		case 24:
			this->crypt = encrypt_ctr192;
			break;
		case 32:
			this->crypt = encrypt_ctr256;
			break;
	}

	return &this->public;
}