/*
 * Source: git.ipfire.org - people/ms/strongswan.git
 * File:   src/libstrongswan/plugins/aesni/aesni_cbc.c
 * Commit: "aesni: Use 4-way parallel AES-NI instructions for CBC decryption"
 */
1 /*
2 * Copyright (C) 2015 Martin Willi
3 * Copyright (C) 2015 revosec AG
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License as published by the
7 * Free Software Foundation; either version 2 of the License, or (at your
8 * option) any later version. See <http://www.fsf.org/copyleft/gpl.txt>.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
12 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * for more details.
14 */
15
16 #include "aesni_cbc.h"
17 #include "aesni_key.h"
18
19 /**
20 * Pipeline parallelism we use for CBC decryption
21 */
22 #define CBC_DECRYPT_PARALLELISM 4
23
24 typedef struct private_aesni_cbc_t private_aesni_cbc_t;
25
26 /**
27 * CBC en/decryption method type
28 */
29 typedef void (*aesni_cbc_fn_t)(aesni_key_t*, u_int, u_char*, u_char*, u_char*);
30
31 /**
32 * Private data of an aesni_cbc_t object.
33 */
34 struct private_aesni_cbc_t {
35
36 /**
37 * Public aesni_cbc_t interface.
38 */
39 aesni_cbc_t public;
40
41 /**
42 * Key size
43 */
44 u_int key_size;
45
46 /**
47 * Encryption key schedule
48 */
49 aesni_key_t *ekey;
50
51 /**
52 * Decryption key schedule
53 */
54 aesni_key_t *dkey;
55
56 /**
57 * Encryption method
58 */
59 aesni_cbc_fn_t encrypt;
60
61 /**
62 * Decryption method
63 */
64 aesni_cbc_fn_t decrypt;
65 };
66
67 /**
68 * AES-128 CBC encryption
69 */
70 static void encrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
71 u_char *iv, u_char *out)
72 {
73 __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
74 __m128i t, fb, *bi, *bo;
75 int i;
76
77 k0 = key->schedule[0];
78 k1 = key->schedule[1];
79 k2 = key->schedule[2];
80 k3 = key->schedule[3];
81 k4 = key->schedule[4];
82 k5 = key->schedule[5];
83 k6 = key->schedule[6];
84 k7 = key->schedule[7];
85 k8 = key->schedule[8];
86 k9 = key->schedule[9];
87 k10 = key->schedule[10];
88
89 bi = (__m128i*)in;
90 bo = (__m128i*)out;
91
92 fb = _mm_loadu_si128((__m128i*)iv);
93 for (i = 0; i < blocks; i++)
94 {
95 t = _mm_loadu_si128(bi + i);
96 fb = _mm_xor_si128(t, fb);
97 fb = _mm_xor_si128(fb, k0);
98
99 fb = _mm_aesenc_si128(fb, k1);
100 fb = _mm_aesenc_si128(fb, k2);
101 fb = _mm_aesenc_si128(fb, k3);
102 fb = _mm_aesenc_si128(fb, k4);
103 fb = _mm_aesenc_si128(fb, k5);
104 fb = _mm_aesenc_si128(fb, k6);
105 fb = _mm_aesenc_si128(fb, k7);
106 fb = _mm_aesenc_si128(fb, k8);
107 fb = _mm_aesenc_si128(fb, k9);
108
109 fb = _mm_aesenclast_si128(fb, k10);
110 _mm_storeu_si128(bo + i, fb);
111 }
112 }
113
114 /**
115 * AES-128 CBC decryption
116 */
/**
 * AES-128 CBC decryption
 *
 * Unlike CBC encryption, decryption of consecutive blocks is independent,
 * so CBC_DECRYPT_PARALLELISM (4) blocks are pipelined per iteration to
 * hide the AESDEC instruction latency.
 */
static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
						   u_char *iv, u_char *out)
{
	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
	__m128i last, *bi, *bo;
	__m128i t1, t2, t3, t4;
	__m128i f1, f2, f3, f4;
	u_int i, pblocks;

	/* cache the decryption round keys in registers for the whole run */
	k0 = key->schedule[0];
	k1 = key->schedule[1];
	k2 = key->schedule[2];
	k3 = key->schedule[3];
	k4 = key->schedule[4];
	k5 = key->schedule[5];
	k6 = key->schedule[6];
	k7 = key->schedule[7];
	k8 = key->schedule[8];
	k9 = key->schedule[9];
	k10 = key->schedule[10];

	bi = (__m128i*)in;
	bo = (__m128i*)out;
	/* number of blocks handled by the 4-way parallel loop; the remainder
	 * (0..3 blocks) is handled by the serial tail loop below */
	pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);

	/* f1..f4 hold the previous ciphertext block (IV for the first one),
	 * XORed into the decryption result to undo the CBC chaining */
	f1 = _mm_loadu_si128((__m128i*)iv);

	for (i = 0; i < pblocks; i += CBC_DECRYPT_PARALLELISM)
	{
		t1 = _mm_loadu_si128(bi + i + 0);
		t2 = _mm_loadu_si128(bi + i + 1);
		t3 = _mm_loadu_si128(bi + i + 2);
		t4 = _mm_loadu_si128(bi + i + 3);

		/* keep ciphertext copies before decrypting in place; they serve as
		 * feedback for the following block (in/out may alias) */
		f2 = t1;
		f3 = t2;
		f4 = t3;
		last = t4;

		t1 = _mm_xor_si128(t1, k0);
		t2 = _mm_xor_si128(t2, k0);
		t3 = _mm_xor_si128(t3, k0);
		t4 = _mm_xor_si128(t4, k0);

		/* 9 full rounds, interleaved over the four blocks so independent
		 * AESDEC instructions can overlap in the pipeline */
		t1 = _mm_aesdec_si128(t1, k1);
		t2 = _mm_aesdec_si128(t2, k1);
		t3 = _mm_aesdec_si128(t3, k1);
		t4 = _mm_aesdec_si128(t4, k1);
		t1 = _mm_aesdec_si128(t1, k2);
		t2 = _mm_aesdec_si128(t2, k2);
		t3 = _mm_aesdec_si128(t3, k2);
		t4 = _mm_aesdec_si128(t4, k2);
		t1 = _mm_aesdec_si128(t1, k3);
		t2 = _mm_aesdec_si128(t2, k3);
		t3 = _mm_aesdec_si128(t3, k3);
		t4 = _mm_aesdec_si128(t4, k3);
		t1 = _mm_aesdec_si128(t1, k4);
		t2 = _mm_aesdec_si128(t2, k4);
		t3 = _mm_aesdec_si128(t3, k4);
		t4 = _mm_aesdec_si128(t4, k4);
		t1 = _mm_aesdec_si128(t1, k5);
		t2 = _mm_aesdec_si128(t2, k5);
		t3 = _mm_aesdec_si128(t3, k5);
		t4 = _mm_aesdec_si128(t4, k5);
		t1 = _mm_aesdec_si128(t1, k6);
		t2 = _mm_aesdec_si128(t2, k6);
		t3 = _mm_aesdec_si128(t3, k6);
		t4 = _mm_aesdec_si128(t4, k6);
		t1 = _mm_aesdec_si128(t1, k7);
		t2 = _mm_aesdec_si128(t2, k7);
		t3 = _mm_aesdec_si128(t3, k7);
		t4 = _mm_aesdec_si128(t4, k7);
		t1 = _mm_aesdec_si128(t1, k8);
		t2 = _mm_aesdec_si128(t2, k8);
		t3 = _mm_aesdec_si128(t3, k8);
		t4 = _mm_aesdec_si128(t4, k8);
		t1 = _mm_aesdec_si128(t1, k9);
		t2 = _mm_aesdec_si128(t2, k9);
		t3 = _mm_aesdec_si128(t3, k9);
		t4 = _mm_aesdec_si128(t4, k9);

		t1 = _mm_aesdeclast_si128(t1, k10);
		t2 = _mm_aesdeclast_si128(t2, k10);
		t3 = _mm_aesdeclast_si128(t3, k10);
		t4 = _mm_aesdeclast_si128(t4, k10);
		/* undo CBC chaining: XOR with previous ciphertext (or IV) */
		t1 = _mm_xor_si128(t1, f1);
		t2 = _mm_xor_si128(t2, f2);
		t3 = _mm_xor_si128(t3, f3);
		t4 = _mm_xor_si128(t4, f4);
		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
		/* carry the last ciphertext block into the next iteration */
		f1 = last;
	}

	/* serial tail loop for the remaining blocks */
	for (i = pblocks; i < blocks; i++)
	{
		last = _mm_loadu_si128(bi + i);
		t1 = _mm_xor_si128(last, k0);

		t1 = _mm_aesdec_si128(t1, k1);
		t1 = _mm_aesdec_si128(t1, k2);
		t1 = _mm_aesdec_si128(t1, k3);
		t1 = _mm_aesdec_si128(t1, k4);
		t1 = _mm_aesdec_si128(t1, k5);
		t1 = _mm_aesdec_si128(t1, k6);
		t1 = _mm_aesdec_si128(t1, k7);
		t1 = _mm_aesdec_si128(t1, k8);
		t1 = _mm_aesdec_si128(t1, k9);

		t1 = _mm_aesdeclast_si128(t1, k10);
		t1 = _mm_xor_si128(t1, f1);
		_mm_storeu_si128(bo + i, t1);
		f1 = last;
	}
}
234
235 /**
236 * AES-192 CBC encryption
237 */
238 static void encrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
239 u_char *iv, u_char *out)
240 {
241 __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
242 __m128i t, fb, *bi, *bo;
243 int i;
244
245 k0 = key->schedule[0];
246 k1 = key->schedule[1];
247 k2 = key->schedule[2];
248 k3 = key->schedule[3];
249 k4 = key->schedule[4];
250 k5 = key->schedule[5];
251 k6 = key->schedule[6];
252 k7 = key->schedule[7];
253 k8 = key->schedule[8];
254 k9 = key->schedule[9];
255 k10 = key->schedule[10];
256 k11 = key->schedule[11];
257 k12 = key->schedule[12];
258
259 bi = (__m128i*)in;
260 bo = (__m128i*)out;
261
262 fb = _mm_loadu_si128((__m128i*)iv);
263 for (i = 0; i < blocks; i++)
264 {
265 t = _mm_loadu_si128(bi + i);
266 fb = _mm_xor_si128(t, fb);
267 fb = _mm_xor_si128(fb, k0);
268
269 fb = _mm_aesenc_si128(fb, k1);
270 fb = _mm_aesenc_si128(fb, k2);
271 fb = _mm_aesenc_si128(fb, k3);
272 fb = _mm_aesenc_si128(fb, k4);
273 fb = _mm_aesenc_si128(fb, k5);
274 fb = _mm_aesenc_si128(fb, k6);
275 fb = _mm_aesenc_si128(fb, k7);
276 fb = _mm_aesenc_si128(fb, k8);
277 fb = _mm_aesenc_si128(fb, k9);
278 fb = _mm_aesenc_si128(fb, k10);
279 fb = _mm_aesenc_si128(fb, k11);
280
281 fb = _mm_aesenclast_si128(fb, k12);
282 _mm_storeu_si128(bo + i, fb);
283 }
284 }
285
286 /**
287 * AES-192 CBC decryption
288 */
/**
 * AES-192 CBC decryption
 *
 * Unlike CBC encryption, decryption of consecutive blocks is independent,
 * so CBC_DECRYPT_PARALLELISM (4) blocks are pipelined per iteration to
 * hide the AESDEC instruction latency.
 */
static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
						   u_char *iv, u_char *out)
{
	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
	__m128i last, *bi, *bo;
	__m128i t1, t2, t3, t4;
	__m128i f1, f2, f3, f4;
	u_int i, pblocks;

	/* cache the decryption round keys in registers for the whole run */
	k0 = key->schedule[0];
	k1 = key->schedule[1];
	k2 = key->schedule[2];
	k3 = key->schedule[3];
	k4 = key->schedule[4];
	k5 = key->schedule[5];
	k6 = key->schedule[6];
	k7 = key->schedule[7];
	k8 = key->schedule[8];
	k9 = key->schedule[9];
	k10 = key->schedule[10];
	k11 = key->schedule[11];
	k12 = key->schedule[12];

	bi = (__m128i*)in;
	bo = (__m128i*)out;
	/* number of blocks handled by the 4-way parallel loop; the remainder
	 * (0..3 blocks) is handled by the serial tail loop below */
	pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);

	/* f1..f4 hold the previous ciphertext block (IV for the first one),
	 * XORed into the decryption result to undo the CBC chaining */
	f1 = _mm_loadu_si128((__m128i*)iv);

	for (i = 0; i < pblocks; i += CBC_DECRYPT_PARALLELISM)
	{
		t1 = _mm_loadu_si128(bi + i + 0);
		t2 = _mm_loadu_si128(bi + i + 1);
		t3 = _mm_loadu_si128(bi + i + 2);
		t4 = _mm_loadu_si128(bi + i + 3);

		/* keep ciphertext copies before decrypting in place; they serve as
		 * feedback for the following block (in/out may alias) */
		f2 = t1;
		f3 = t2;
		f4 = t3;
		last = t4;

		t1 = _mm_xor_si128(t1, k0);
		t2 = _mm_xor_si128(t2, k0);
		t3 = _mm_xor_si128(t3, k0);
		t4 = _mm_xor_si128(t4, k0);

		/* 11 full rounds, interleaved over the four blocks so independent
		 * AESDEC instructions can overlap in the pipeline */
		t1 = _mm_aesdec_si128(t1, k1);
		t2 = _mm_aesdec_si128(t2, k1);
		t3 = _mm_aesdec_si128(t3, k1);
		t4 = _mm_aesdec_si128(t4, k1);
		t1 = _mm_aesdec_si128(t1, k2);
		t2 = _mm_aesdec_si128(t2, k2);
		t3 = _mm_aesdec_si128(t3, k2);
		t4 = _mm_aesdec_si128(t4, k2);
		t1 = _mm_aesdec_si128(t1, k3);
		t2 = _mm_aesdec_si128(t2, k3);
		t3 = _mm_aesdec_si128(t3, k3);
		t4 = _mm_aesdec_si128(t4, k3);
		t1 = _mm_aesdec_si128(t1, k4);
		t2 = _mm_aesdec_si128(t2, k4);
		t3 = _mm_aesdec_si128(t3, k4);
		t4 = _mm_aesdec_si128(t4, k4);
		t1 = _mm_aesdec_si128(t1, k5);
		t2 = _mm_aesdec_si128(t2, k5);
		t3 = _mm_aesdec_si128(t3, k5);
		t4 = _mm_aesdec_si128(t4, k5);
		t1 = _mm_aesdec_si128(t1, k6);
		t2 = _mm_aesdec_si128(t2, k6);
		t3 = _mm_aesdec_si128(t3, k6);
		t4 = _mm_aesdec_si128(t4, k6);
		t1 = _mm_aesdec_si128(t1, k7);
		t2 = _mm_aesdec_si128(t2, k7);
		t3 = _mm_aesdec_si128(t3, k7);
		t4 = _mm_aesdec_si128(t4, k7);
		t1 = _mm_aesdec_si128(t1, k8);
		t2 = _mm_aesdec_si128(t2, k8);
		t3 = _mm_aesdec_si128(t3, k8);
		t4 = _mm_aesdec_si128(t4, k8);
		t1 = _mm_aesdec_si128(t1, k9);
		t2 = _mm_aesdec_si128(t2, k9);
		t3 = _mm_aesdec_si128(t3, k9);
		t4 = _mm_aesdec_si128(t4, k9);
		t1 = _mm_aesdec_si128(t1, k10);
		t2 = _mm_aesdec_si128(t2, k10);
		t3 = _mm_aesdec_si128(t3, k10);
		t4 = _mm_aesdec_si128(t4, k10);
		t1 = _mm_aesdec_si128(t1, k11);
		t2 = _mm_aesdec_si128(t2, k11);
		t3 = _mm_aesdec_si128(t3, k11);
		t4 = _mm_aesdec_si128(t4, k11);

		t1 = _mm_aesdeclast_si128(t1, k12);
		t2 = _mm_aesdeclast_si128(t2, k12);
		t3 = _mm_aesdeclast_si128(t3, k12);
		t4 = _mm_aesdeclast_si128(t4, k12);
		/* undo CBC chaining: XOR with previous ciphertext (or IV) */
		t1 = _mm_xor_si128(t1, f1);
		t2 = _mm_xor_si128(t2, f2);
		t3 = _mm_xor_si128(t3, f3);
		t4 = _mm_xor_si128(t4, f4);
		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
		/* carry the last ciphertext block into the next iteration */
		f1 = last;
	}

	/* serial tail loop for the remaining blocks */
	for (i = pblocks; i < blocks; i++)
	{
		last = _mm_loadu_si128(bi + i);
		t1 = _mm_xor_si128(last, k0);

		t1 = _mm_aesdec_si128(t1, k1);
		t1 = _mm_aesdec_si128(t1, k2);
		t1 = _mm_aesdec_si128(t1, k3);
		t1 = _mm_aesdec_si128(t1, k4);
		t1 = _mm_aesdec_si128(t1, k5);
		t1 = _mm_aesdec_si128(t1, k6);
		t1 = _mm_aesdec_si128(t1, k7);
		t1 = _mm_aesdec_si128(t1, k8);
		t1 = _mm_aesdec_si128(t1, k9);
		t1 = _mm_aesdec_si128(t1, k10);
		t1 = _mm_aesdec_si128(t1, k11);

		t1 = _mm_aesdeclast_si128(t1, k12);
		t1 = _mm_xor_si128(t1, f1);
		_mm_storeu_si128(bo + i, t1);
		f1 = last;
	}
}
418
419 /**
420 * AES-256 CBC encryption
421 */
422 static void encrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
423 u_char *iv, u_char *out)
424 {
425 __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
426 __m128i t, fb, *bi, *bo;
427 int i;
428
429 k0 = key->schedule[0];
430 k1 = key->schedule[1];
431 k2 = key->schedule[2];
432 k3 = key->schedule[3];
433 k4 = key->schedule[4];
434 k5 = key->schedule[5];
435 k6 = key->schedule[6];
436 k7 = key->schedule[7];
437 k8 = key->schedule[8];
438 k9 = key->schedule[9];
439 k10 = key->schedule[10];
440 k11 = key->schedule[11];
441 k12 = key->schedule[12];
442 k13 = key->schedule[13];
443 k14 = key->schedule[14];
444
445 bi = (__m128i*)in;
446 bo = (__m128i*)out;
447
448 fb = _mm_loadu_si128((__m128i*)iv);
449 for (i = 0; i < blocks; i++)
450 {
451 t = _mm_loadu_si128(bi + i);
452 fb = _mm_xor_si128(t, fb);
453 fb = _mm_xor_si128(fb, k0);
454
455 fb = _mm_aesenc_si128(fb, k1);
456 fb = _mm_aesenc_si128(fb, k2);
457 fb = _mm_aesenc_si128(fb, k3);
458 fb = _mm_aesenc_si128(fb, k4);
459 fb = _mm_aesenc_si128(fb, k5);
460 fb = _mm_aesenc_si128(fb, k6);
461 fb = _mm_aesenc_si128(fb, k7);
462 fb = _mm_aesenc_si128(fb, k8);
463 fb = _mm_aesenc_si128(fb, k9);
464 fb = _mm_aesenc_si128(fb, k10);
465 fb = _mm_aesenc_si128(fb, k11);
466 fb = _mm_aesenc_si128(fb, k12);
467 fb = _mm_aesenc_si128(fb, k13);
468
469 fb = _mm_aesenclast_si128(fb, k14);
470 _mm_storeu_si128(bo + i, fb);
471 }
472 }
473
474 /**
475 * AES-256 CBC decryption
476 */
/**
 * AES-256 CBC decryption
 *
 * Unlike CBC encryption, decryption of consecutive blocks is independent,
 * so CBC_DECRYPT_PARALLELISM (4) blocks are pipelined per iteration to
 * hide the AESDEC instruction latency.
 */
static void decrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
						   u_char *iv, u_char *out)
{
	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
	__m128i last, *bi, *bo;
	__m128i t1, t2, t3, t4;
	__m128i f1, f2, f3, f4;
	u_int i, pblocks;

	/* cache the decryption round keys in registers for the whole run */
	k0 = key->schedule[0];
	k1 = key->schedule[1];
	k2 = key->schedule[2];
	k3 = key->schedule[3];
	k4 = key->schedule[4];
	k5 = key->schedule[5];
	k6 = key->schedule[6];
	k7 = key->schedule[7];
	k8 = key->schedule[8];
	k9 = key->schedule[9];
	k10 = key->schedule[10];
	k11 = key->schedule[11];
	k12 = key->schedule[12];
	k13 = key->schedule[13];
	k14 = key->schedule[14];

	bi = (__m128i*)in;
	bo = (__m128i*)out;
	/* number of blocks handled by the 4-way parallel loop; the remainder
	 * (0..3 blocks) is handled by the serial tail loop below */
	pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);

	/* f1..f4 hold the previous ciphertext block (IV for the first one),
	 * XORed into the decryption result to undo the CBC chaining */
	f1 = _mm_loadu_si128((__m128i*)iv);

	for (i = 0; i < pblocks; i += CBC_DECRYPT_PARALLELISM)
	{
		t1 = _mm_loadu_si128(bi + i + 0);
		t2 = _mm_loadu_si128(bi + i + 1);
		t3 = _mm_loadu_si128(bi + i + 2);
		t4 = _mm_loadu_si128(bi + i + 3);

		/* keep ciphertext copies before decrypting in place; they serve as
		 * feedback for the following block (in/out may alias) */
		f2 = t1;
		f3 = t2;
		f4 = t3;
		last = t4;

		t1 = _mm_xor_si128(t1, k0);
		t2 = _mm_xor_si128(t2, k0);
		t3 = _mm_xor_si128(t3, k0);
		t4 = _mm_xor_si128(t4, k0);

		/* 13 full rounds, interleaved over the four blocks so independent
		 * AESDEC instructions can overlap in the pipeline */
		t1 = _mm_aesdec_si128(t1, k1);
		t2 = _mm_aesdec_si128(t2, k1);
		t3 = _mm_aesdec_si128(t3, k1);
		t4 = _mm_aesdec_si128(t4, k1);
		t1 = _mm_aesdec_si128(t1, k2);
		t2 = _mm_aesdec_si128(t2, k2);
		t3 = _mm_aesdec_si128(t3, k2);
		t4 = _mm_aesdec_si128(t4, k2);
		t1 = _mm_aesdec_si128(t1, k3);
		t2 = _mm_aesdec_si128(t2, k3);
		t3 = _mm_aesdec_si128(t3, k3);
		t4 = _mm_aesdec_si128(t4, k3);
		t1 = _mm_aesdec_si128(t1, k4);
		t2 = _mm_aesdec_si128(t2, k4);
		t3 = _mm_aesdec_si128(t3, k4);
		t4 = _mm_aesdec_si128(t4, k4);
		t1 = _mm_aesdec_si128(t1, k5);
		t2 = _mm_aesdec_si128(t2, k5);
		t3 = _mm_aesdec_si128(t3, k5);
		t4 = _mm_aesdec_si128(t4, k5);
		t1 = _mm_aesdec_si128(t1, k6);
		t2 = _mm_aesdec_si128(t2, k6);
		t3 = _mm_aesdec_si128(t3, k6);
		t4 = _mm_aesdec_si128(t4, k6);
		t1 = _mm_aesdec_si128(t1, k7);
		t2 = _mm_aesdec_si128(t2, k7);
		t3 = _mm_aesdec_si128(t3, k7);
		t4 = _mm_aesdec_si128(t4, k7);
		t1 = _mm_aesdec_si128(t1, k8);
		t2 = _mm_aesdec_si128(t2, k8);
		t3 = _mm_aesdec_si128(t3, k8);
		t4 = _mm_aesdec_si128(t4, k8);
		t1 = _mm_aesdec_si128(t1, k9);
		t2 = _mm_aesdec_si128(t2, k9);
		t3 = _mm_aesdec_si128(t3, k9);
		t4 = _mm_aesdec_si128(t4, k9);
		t1 = _mm_aesdec_si128(t1, k10);
		t2 = _mm_aesdec_si128(t2, k10);
		t3 = _mm_aesdec_si128(t3, k10);
		t4 = _mm_aesdec_si128(t4, k10);
		t1 = _mm_aesdec_si128(t1, k11);
		t2 = _mm_aesdec_si128(t2, k11);
		t3 = _mm_aesdec_si128(t3, k11);
		t4 = _mm_aesdec_si128(t4, k11);
		t1 = _mm_aesdec_si128(t1, k12);
		t2 = _mm_aesdec_si128(t2, k12);
		t3 = _mm_aesdec_si128(t3, k12);
		t4 = _mm_aesdec_si128(t4, k12);
		t1 = _mm_aesdec_si128(t1, k13);
		t2 = _mm_aesdec_si128(t2, k13);
		t3 = _mm_aesdec_si128(t3, k13);
		t4 = _mm_aesdec_si128(t4, k13);

		t1 = _mm_aesdeclast_si128(t1, k14);
		t2 = _mm_aesdeclast_si128(t2, k14);
		t3 = _mm_aesdeclast_si128(t3, k14);
		t4 = _mm_aesdeclast_si128(t4, k14);
		/* undo CBC chaining: XOR with previous ciphertext (or IV) */
		t1 = _mm_xor_si128(t1, f1);
		t2 = _mm_xor_si128(t2, f2);
		t3 = _mm_xor_si128(t3, f3);
		t4 = _mm_xor_si128(t4, f4);
		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
		/* carry the last ciphertext block into the next iteration */
		f1 = last;
	}

	/* serial tail loop for the remaining blocks */
	for (i = pblocks; i < blocks; i++)
	{
		last = _mm_loadu_si128(bi + i);
		t1 = _mm_xor_si128(last, k0);

		t1 = _mm_aesdec_si128(t1, k1);
		t1 = _mm_aesdec_si128(t1, k2);
		t1 = _mm_aesdec_si128(t1, k3);
		t1 = _mm_aesdec_si128(t1, k4);
		t1 = _mm_aesdec_si128(t1, k5);
		t1 = _mm_aesdec_si128(t1, k6);
		t1 = _mm_aesdec_si128(t1, k7);
		t1 = _mm_aesdec_si128(t1, k8);
		t1 = _mm_aesdec_si128(t1, k9);
		t1 = _mm_aesdec_si128(t1, k10);
		t1 = _mm_aesdec_si128(t1, k11);
		t1 = _mm_aesdec_si128(t1, k12);
		t1 = _mm_aesdec_si128(t1, k13);

		t1 = _mm_aesdeclast_si128(t1, k14);
		t1 = _mm_xor_si128(t1, f1);
		_mm_storeu_si128(bo + i, t1);
		f1 = last;
	}
}
618
619 /**
620 * Do inline or allocated de/encryption using key schedule
621 */
622 static bool crypt(aesni_cbc_fn_t fn, aesni_key_t *key,
623 chunk_t data, chunk_t iv, chunk_t *out)
624 {
625 u_char *buf;
626
627 if (!key || iv.len != AES_BLOCK_SIZE || data.len % AES_BLOCK_SIZE)
628 {
629 return FALSE;
630 }
631 if (out)
632 {
633 *out = chunk_alloc(data.len);
634 buf = out->ptr;
635 }
636 else
637 {
638 buf = data.ptr;
639 }
640 fn(key, data.len / AES_BLOCK_SIZE, data.ptr, iv.ptr, buf);
641 return TRUE;
642 }
643
/* crypter_t.encrypt: CBC-encrypt data with the encryption key schedule */
METHOD(crypter_t, encrypt, bool,
	private_aesni_cbc_t *this, chunk_t data, chunk_t iv, chunk_t *encrypted)
{
	return crypt(this->encrypt, this->ekey, data, iv, encrypted);
}
649
/* crypter_t.decrypt: CBC-decrypt data with the decryption key schedule */
METHOD(crypter_t, decrypt, bool,
	private_aesni_cbc_t *this, chunk_t data, chunk_t iv, chunk_t *decrypted)
{
	return crypt(this->decrypt, this->dkey, data, iv, decrypted);
}
655
/* crypter_t.get_block_size: AES always operates on 16-byte blocks */
METHOD(crypter_t, get_block_size, size_t,
	private_aesni_cbc_t *this)
{
	return AES_BLOCK_SIZE;
}
661
/* crypter_t.get_iv_size: CBC uses an IV of one AES block */
METHOD(crypter_t, get_iv_size, size_t,
	private_aesni_cbc_t *this)
{
	return AES_BLOCK_SIZE;
}
667
/* crypter_t.get_key_size: key size in bytes selected at construction */
METHOD(crypter_t, get_key_size, size_t,
	private_aesni_cbc_t *this)
{
	return this->key_size;
}
673
674 METHOD(crypter_t, set_key, bool,
675 private_aesni_cbc_t *this, chunk_t key)
676 {
677 if (key.len != this->key_size)
678 {
679 return FALSE;
680 }
681
682 DESTROY_IF(this->ekey);
683 DESTROY_IF(this->dkey);
684
685 this->ekey = aesni_key_create(TRUE, key);
686 this->dkey = aesni_key_create(FALSE, key);
687
688 return this->ekey && this->dkey;
689 }
690
/* crypter_t.destroy: release key schedules and the instance itself */
METHOD(crypter_t, destroy, void,
	private_aesni_cbc_t *this)
{
	DESTROY_IF(this->ekey);
	DESTROY_IF(this->dkey);
	free(this);
}
698
699 /**
700 * See header
701 */
702 aesni_cbc_t *aesni_cbc_create(encryption_algorithm_t algo, size_t key_size)
703 {
704 private_aesni_cbc_t *this;
705
706 if (algo != ENCR_AES_CBC)
707 {
708 return NULL;
709 }
710 switch (key_size)
711 {
712 case 0:
713 key_size = 16;
714 break;
715 case 16:
716 case 24:
717 case 32:
718 break;
719 default:
720 return NULL;
721 }
722
723 INIT(this,
724 .public = {
725 .crypter = {
726 .encrypt = _encrypt,
727 .decrypt = _decrypt,
728 .get_block_size = _get_block_size,
729 .get_iv_size = _get_iv_size,
730 .get_key_size = _get_key_size,
731 .set_key = _set_key,
732 .destroy = _destroy,
733 },
734 },
735 .key_size = key_size,
736 );
737
738 switch (key_size)
739 {
740 case 16:
741 this->encrypt = encrypt_cbc128;
742 this->decrypt = decrypt_cbc128;
743 break;
744 case 24:
745 this->encrypt = encrypt_cbc192;
746 this->decrypt = decrypt_cbc192;
747 break;
748 case 32:
749 this->encrypt = encrypt_cbc256;
750 this->decrypt = decrypt_cbc256;
751 break;
752 }
753
754 return &this->public;
755 }