]>
Commit | Line | Data |
---|---|---|
74d43cbd MW |
1 | /* |
2 | * Copyright (C) 2015 Martin Willi | |
3 | * Copyright (C) 2015 revosec AG | |
4 | * | |
5 | * This program is free software; you can redistribute it and/or modify it | |
6 | * under the terms of the GNU General Public License as published by the | |
7 | * Free Software Foundation; either version 2 of the License, or (at your | |
8 | * option) any later version. See <http://www.fsf.org/copyleft/gpl.txt>. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
12 | * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
13 | * for more details. | |
14 | */ | |
15 | ||
16 | #include "aesni_ctr.h" | |
17 | #include "aesni_key.h" | |
18 | ||
19 | #include <tmmintrin.h> | |
20 | ||
8488dea2 MW |
/**
 * Pipeline parallelism we use for CTR en/decryption
 *
 * AESENC has multi-cycle latency; interleaving this many independent
 * counter blocks per loop iteration keeps the AES-NI pipeline busy.
 */
#define CTR_CRYPT_PARALLELISM 4
74d43cbd MW |
/* Forward declaration so the function pointer type below can reference it */
typedef struct private_aesni_ctr_t private_aesni_ctr_t;

/**
 * CTR en/decryption method type
 *
 * Processes a given number of bytes from in to out using the object's
 * current counter state; one variant exists per AES key size.
 */
typedef void (*aesni_ctr_fn_t)(private_aesni_ctr_t*, size_t, u_char*, u_char*);
32 | ||
/**
 * Private data of an aesni_ctr_t object.
 */
struct private_aesni_ctr_t {

	/**
	 * Public aesni_ctr_t interface.
	 */
	aesni_ctr_t public;

	/**
	 * Key size (of the AES key alone, excluding the 4 byte nonce salt)
	 */
	u_int key_size;

	/**
	 * Key schedule
	 */
	aesni_key_t *key;

	/**
	 * Encryption method, selected to match the key size
	 */
	aesni_ctr_fn_t crypt;

	/**
	 * Counter state: nonce | IV | counter together form the 16 byte
	 * CTR block.  Packed and 16-byte aligned so the whole struct can be
	 * loaded with one aligned _mm_load_si128().
	 */
	struct {
		char nonce[4];
		char iv[8];
		u_int32_t counter;
	} __attribute__((packed, aligned(sizeof(__m128i)))) state;
};
67 | ||
68 | /** | |
9e47c1fe | 69 | * Do big-endian increment on x |
74d43cbd | 70 | */ |
9e47c1fe | 71 | static inline __m128i increment_be(__m128i x) |
74d43cbd | 72 | { |
9e47c1fe | 73 | __m128i swap; |
74d43cbd | 74 | |
74d43cbd | 75 | swap = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); |
9e47c1fe MW |
76 | |
77 | x = _mm_shuffle_epi8(x, swap); | |
78 | x = _mm_add_epi64(x, _mm_set_epi32(0, 0, 0, 1)); | |
79 | x = _mm_shuffle_epi8(x, swap); | |
80 | ||
81 | return x; | |
82 | } | |
83 | ||
/**
 * AES-128 CTR encryption
 *
 * En/decrypts len bytes from in to out (CTR mode is its own inverse).
 * Four counter blocks are pipelined per loop iteration to hide AESENC
 * latency; leftover whole blocks are handled one at a time, and a
 * trailing partial block goes through a zero-padded stack buffer.
 */
static void encrypt_ctr128(private_aesni_ctr_t *this,
						   size_t len, u_char *in, u_char *out)
{
	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
	__m128i t1, t2, t3, t4;
	__m128i d1, d2, d3, d4;
	__m128i state, b, *bi, *bo;
	u_int i, blocks, pblocks, rem;

	/* the state struct is 16-byte aligned, an aligned load is safe */
	state = _mm_load_si128((__m128i*)&this->state);
	blocks = len / AES_BLOCK_SIZE;
	/* blocks we can process in groups of CTR_CRYPT_PARALLELISM */
	pblocks = blocks - (blocks % CTR_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	/* keep the full AES-128 key schedule (10 rounds) in registers */
	k0 = this->key->schedule[0];
	k1 = this->key->schedule[1];
	k2 = this->key->schedule[2];
	k3 = this->key->schedule[3];
	k4 = this->key->schedule[4];
	k5 = this->key->schedule[5];
	k6 = this->key->schedule[6];
	k7 = this->key->schedule[7];
	k8 = this->key->schedule[8];
	k9 = this->key->schedule[9];
	k10 = this->key->schedule[10];

	/* four-way parallel part: interleave the rounds of four independent
	 * counter blocks to keep the AES-NI pipeline saturated */
	for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		/* initial whitening of four consecutive counter values */
		t1 = _mm_xor_si128(state, k0);
		state = increment_be(state);
		t2 = _mm_xor_si128(state, k0);
		state = increment_be(state);
		t3 = _mm_xor_si128(state, k0);
		state = increment_be(state);
		t4 = _mm_xor_si128(state, k0);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, k1);
		t2 = _mm_aesenc_si128(t2, k1);
		t3 = _mm_aesenc_si128(t3, k1);
		t4 = _mm_aesenc_si128(t4, k1);
		t1 = _mm_aesenc_si128(t1, k2);
		t2 = _mm_aesenc_si128(t2, k2);
		t3 = _mm_aesenc_si128(t3, k2);
		t4 = _mm_aesenc_si128(t4, k2);
		t1 = _mm_aesenc_si128(t1, k3);
		t2 = _mm_aesenc_si128(t2, k3);
		t3 = _mm_aesenc_si128(t3, k3);
		t4 = _mm_aesenc_si128(t4, k3);
		t1 = _mm_aesenc_si128(t1, k4);
		t2 = _mm_aesenc_si128(t2, k4);
		t3 = _mm_aesenc_si128(t3, k4);
		t4 = _mm_aesenc_si128(t4, k4);
		t1 = _mm_aesenc_si128(t1, k5);
		t2 = _mm_aesenc_si128(t2, k5);
		t3 = _mm_aesenc_si128(t3, k5);
		t4 = _mm_aesenc_si128(t4, k5);
		t1 = _mm_aesenc_si128(t1, k6);
		t2 = _mm_aesenc_si128(t2, k6);
		t3 = _mm_aesenc_si128(t3, k6);
		t4 = _mm_aesenc_si128(t4, k6);
		t1 = _mm_aesenc_si128(t1, k7);
		t2 = _mm_aesenc_si128(t2, k7);
		t3 = _mm_aesenc_si128(t3, k7);
		t4 = _mm_aesenc_si128(t4, k7);
		t1 = _mm_aesenc_si128(t1, k8);
		t2 = _mm_aesenc_si128(t2, k8);
		t3 = _mm_aesenc_si128(t3, k8);
		t4 = _mm_aesenc_si128(t4, k8);
		t1 = _mm_aesenc_si128(t1, k9);
		t2 = _mm_aesenc_si128(t2, k9);
		t3 = _mm_aesenc_si128(t3, k9);
		t4 = _mm_aesenc_si128(t4, k9);

		t1 = _mm_aesenclast_si128(t1, k10);
		t2 = _mm_aesenclast_si128(t2, k10);
		t3 = _mm_aesenclast_si128(t3, k10);
		t4 = _mm_aesenclast_si128(t4, k10);
		/* XOR the keystream into the data */
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);
		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	/* leftover whole blocks, one at a time */
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(state, k0);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, k1);
		t1 = _mm_aesenc_si128(t1, k2);
		t1 = _mm_aesenc_si128(t1, k3);
		t1 = _mm_aesenc_si128(t1, k4);
		t1 = _mm_aesenc_si128(t1, k5);
		t1 = _mm_aesenc_si128(t1, k6);
		t1 = _mm_aesenc_si128(t1, k7);
		t1 = _mm_aesenc_si128(t1, k8);
		t1 = _mm_aesenc_si128(t1, k9);

		t1 = _mm_aesenclast_si128(t1, k10);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);
	}

	/* trailing partial block: zero-pad it through a stack buffer, then
	 * copy only the rem valid bytes back out */
	if (rem)
	{
		memset(&b, 0, sizeof(b));
		memcpy(&b, bi + blocks, rem);

		d1 = _mm_loadu_si128(&b);
		t1 = _mm_xor_si128(state, k0);

		t1 = _mm_aesenc_si128(t1, k1);
		t1 = _mm_aesenc_si128(t1, k2);
		t1 = _mm_aesenc_si128(t1, k3);
		t1 = _mm_aesenc_si128(t1, k4);
		t1 = _mm_aesenc_si128(t1, k5);
		t1 = _mm_aesenc_si128(t1, k6);
		t1 = _mm_aesenc_si128(t1, k7);
		t1 = _mm_aesenc_si128(t1, k8);
		t1 = _mm_aesenc_si128(t1, k9);

		t1 = _mm_aesenclast_si128(t1, k10);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(&b, t1);

		memcpy(bo + blocks, &b, rem);
	}
}
229 | ||
/**
 * AES-192 CTR encryption
 *
 * Same structure as encrypt_ctr128(), but with the 12-round AES-192 key
 * schedule: four pipelined counter blocks, then single leftover blocks,
 * then a zero-padded trailing partial block.
 */
static void encrypt_ctr192(private_aesni_ctr_t *this,
						   size_t len, u_char *in, u_char *out)
{
	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
	__m128i t1, t2, t3, t4;
	__m128i d1, d2, d3, d4;
	__m128i state, b, *bi, *bo;
	u_int i, blocks, pblocks, rem;

	/* the state struct is 16-byte aligned, an aligned load is safe */
	state = _mm_load_si128((__m128i*)&this->state);
	blocks = len / AES_BLOCK_SIZE;
	/* blocks we can process in groups of CTR_CRYPT_PARALLELISM */
	pblocks = blocks - (blocks % CTR_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	/* keep the full AES-192 key schedule (12 rounds) in registers */
	k0 = this->key->schedule[0];
	k1 = this->key->schedule[1];
	k2 = this->key->schedule[2];
	k3 = this->key->schedule[3];
	k4 = this->key->schedule[4];
	k5 = this->key->schedule[5];
	k6 = this->key->schedule[6];
	k7 = this->key->schedule[7];
	k8 = this->key->schedule[8];
	k9 = this->key->schedule[9];
	k10 = this->key->schedule[10];
	k11 = this->key->schedule[11];
	k12 = this->key->schedule[12];

	/* four-way parallel part: interleave the rounds of four independent
	 * counter blocks to keep the AES-NI pipeline saturated */
	for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		/* initial whitening of four consecutive counter values */
		t1 = _mm_xor_si128(state, k0);
		state = increment_be(state);
		t2 = _mm_xor_si128(state, k0);
		state = increment_be(state);
		t3 = _mm_xor_si128(state, k0);
		state = increment_be(state);
		t4 = _mm_xor_si128(state, k0);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, k1);
		t2 = _mm_aesenc_si128(t2, k1);
		t3 = _mm_aesenc_si128(t3, k1);
		t4 = _mm_aesenc_si128(t4, k1);
		t1 = _mm_aesenc_si128(t1, k2);
		t2 = _mm_aesenc_si128(t2, k2);
		t3 = _mm_aesenc_si128(t3, k2);
		t4 = _mm_aesenc_si128(t4, k2);
		t1 = _mm_aesenc_si128(t1, k3);
		t2 = _mm_aesenc_si128(t2, k3);
		t3 = _mm_aesenc_si128(t3, k3);
		t4 = _mm_aesenc_si128(t4, k3);
		t1 = _mm_aesenc_si128(t1, k4);
		t2 = _mm_aesenc_si128(t2, k4);
		t3 = _mm_aesenc_si128(t3, k4);
		t4 = _mm_aesenc_si128(t4, k4);
		t1 = _mm_aesenc_si128(t1, k5);
		t2 = _mm_aesenc_si128(t2, k5);
		t3 = _mm_aesenc_si128(t3, k5);
		t4 = _mm_aesenc_si128(t4, k5);
		t1 = _mm_aesenc_si128(t1, k6);
		t2 = _mm_aesenc_si128(t2, k6);
		t3 = _mm_aesenc_si128(t3, k6);
		t4 = _mm_aesenc_si128(t4, k6);
		t1 = _mm_aesenc_si128(t1, k7);
		t2 = _mm_aesenc_si128(t2, k7);
		t3 = _mm_aesenc_si128(t3, k7);
		t4 = _mm_aesenc_si128(t4, k7);
		t1 = _mm_aesenc_si128(t1, k8);
		t2 = _mm_aesenc_si128(t2, k8);
		t3 = _mm_aesenc_si128(t3, k8);
		t4 = _mm_aesenc_si128(t4, k8);
		t1 = _mm_aesenc_si128(t1, k9);
		t2 = _mm_aesenc_si128(t2, k9);
		t3 = _mm_aesenc_si128(t3, k9);
		t4 = _mm_aesenc_si128(t4, k9);
		t1 = _mm_aesenc_si128(t1, k10);
		t2 = _mm_aesenc_si128(t2, k10);
		t3 = _mm_aesenc_si128(t3, k10);
		t4 = _mm_aesenc_si128(t4, k10);
		t1 = _mm_aesenc_si128(t1, k11);
		t2 = _mm_aesenc_si128(t2, k11);
		t3 = _mm_aesenc_si128(t3, k11);
		t4 = _mm_aesenc_si128(t4, k11);

		t1 = _mm_aesenclast_si128(t1, k12);
		t2 = _mm_aesenclast_si128(t2, k12);
		t3 = _mm_aesenclast_si128(t3, k12);
		t4 = _mm_aesenclast_si128(t4, k12);
		/* XOR the keystream into the data */
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);
		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	/* leftover whole blocks, one at a time */
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(state, k0);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, k1);
		t1 = _mm_aesenc_si128(t1, k2);
		t1 = _mm_aesenc_si128(t1, k3);
		t1 = _mm_aesenc_si128(t1, k4);
		t1 = _mm_aesenc_si128(t1, k5);
		t1 = _mm_aesenc_si128(t1, k6);
		t1 = _mm_aesenc_si128(t1, k7);
		t1 = _mm_aesenc_si128(t1, k8);
		t1 = _mm_aesenc_si128(t1, k9);
		t1 = _mm_aesenc_si128(t1, k10);
		t1 = _mm_aesenc_si128(t1, k11);

		t1 = _mm_aesenclast_si128(t1, k12);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);
	}

	/* trailing partial block: zero-pad it through a stack buffer, then
	 * copy only the rem valid bytes back out */
	if (rem)
	{
		memset(&b, 0, sizeof(b));
		memcpy(&b, bi + blocks, rem);

		d1 = _mm_loadu_si128(&b);
		t1 = _mm_xor_si128(state, k0);

		t1 = _mm_aesenc_si128(t1, k1);
		t1 = _mm_aesenc_si128(t1, k2);
		t1 = _mm_aesenc_si128(t1, k3);
		t1 = _mm_aesenc_si128(t1, k4);
		t1 = _mm_aesenc_si128(t1, k5);
		t1 = _mm_aesenc_si128(t1, k6);
		t1 = _mm_aesenc_si128(t1, k7);
		t1 = _mm_aesenc_si128(t1, k8);
		t1 = _mm_aesenc_si128(t1, k9);
		t1 = _mm_aesenc_si128(t1, k10);
		t1 = _mm_aesenc_si128(t1, k11);

		t1 = _mm_aesenclast_si128(t1, k12);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(&b, t1);

		memcpy(bo + blocks, &b, rem);
	}
}
389 | ||
/**
 * AES-256 CTR encryption
 *
 * Same structure as encrypt_ctr128(), but with the 14-round AES-256 key
 * schedule: four pipelined counter blocks, then single leftover blocks,
 * then a zero-padded trailing partial block.
 */
static void encrypt_ctr256(private_aesni_ctr_t *this,
						   size_t len, u_char *in, u_char *out)
{
	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
	__m128i t1, t2, t3, t4;
	__m128i d1, d2, d3, d4;
	__m128i state, b, *bi, *bo;
	u_int i, blocks, pblocks, rem;

	/* the state struct is 16-byte aligned, an aligned load is safe */
	state = _mm_load_si128((__m128i*)&this->state);
	blocks = len / AES_BLOCK_SIZE;
	/* blocks we can process in groups of CTR_CRYPT_PARALLELISM */
	pblocks = blocks - (blocks % CTR_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	/* keep the full AES-256 key schedule (14 rounds) in registers */
	k0 = this->key->schedule[0];
	k1 = this->key->schedule[1];
	k2 = this->key->schedule[2];
	k3 = this->key->schedule[3];
	k4 = this->key->schedule[4];
	k5 = this->key->schedule[5];
	k6 = this->key->schedule[6];
	k7 = this->key->schedule[7];
	k8 = this->key->schedule[8];
	k9 = this->key->schedule[9];
	k10 = this->key->schedule[10];
	k11 = this->key->schedule[11];
	k12 = this->key->schedule[12];
	k13 = this->key->schedule[13];
	k14 = this->key->schedule[14];

	/* four-way parallel part: interleave the rounds of four independent
	 * counter blocks to keep the AES-NI pipeline saturated */
	for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		/* initial whitening of four consecutive counter values */
		t1 = _mm_xor_si128(state, k0);
		state = increment_be(state);
		t2 = _mm_xor_si128(state, k0);
		state = increment_be(state);
		t3 = _mm_xor_si128(state, k0);
		state = increment_be(state);
		t4 = _mm_xor_si128(state, k0);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, k1);
		t2 = _mm_aesenc_si128(t2, k1);
		t3 = _mm_aesenc_si128(t3, k1);
		t4 = _mm_aesenc_si128(t4, k1);
		t1 = _mm_aesenc_si128(t1, k2);
		t2 = _mm_aesenc_si128(t2, k2);
		t3 = _mm_aesenc_si128(t3, k2);
		t4 = _mm_aesenc_si128(t4, k2);
		t1 = _mm_aesenc_si128(t1, k3);
		t2 = _mm_aesenc_si128(t2, k3);
		t3 = _mm_aesenc_si128(t3, k3);
		t4 = _mm_aesenc_si128(t4, k3);
		t1 = _mm_aesenc_si128(t1, k4);
		t2 = _mm_aesenc_si128(t2, k4);
		t3 = _mm_aesenc_si128(t3, k4);
		t4 = _mm_aesenc_si128(t4, k4);
		t1 = _mm_aesenc_si128(t1, k5);
		t2 = _mm_aesenc_si128(t2, k5);
		t3 = _mm_aesenc_si128(t3, k5);
		t4 = _mm_aesenc_si128(t4, k5);
		t1 = _mm_aesenc_si128(t1, k6);
		t2 = _mm_aesenc_si128(t2, k6);
		t3 = _mm_aesenc_si128(t3, k6);
		t4 = _mm_aesenc_si128(t4, k6);
		t1 = _mm_aesenc_si128(t1, k7);
		t2 = _mm_aesenc_si128(t2, k7);
		t3 = _mm_aesenc_si128(t3, k7);
		t4 = _mm_aesenc_si128(t4, k7);
		t1 = _mm_aesenc_si128(t1, k8);
		t2 = _mm_aesenc_si128(t2, k8);
		t3 = _mm_aesenc_si128(t3, k8);
		t4 = _mm_aesenc_si128(t4, k8);
		t1 = _mm_aesenc_si128(t1, k9);
		t2 = _mm_aesenc_si128(t2, k9);
		t3 = _mm_aesenc_si128(t3, k9);
		t4 = _mm_aesenc_si128(t4, k9);
		t1 = _mm_aesenc_si128(t1, k10);
		t2 = _mm_aesenc_si128(t2, k10);
		t3 = _mm_aesenc_si128(t3, k10);
		t4 = _mm_aesenc_si128(t4, k10);
		t1 = _mm_aesenc_si128(t1, k11);
		t2 = _mm_aesenc_si128(t2, k11);
		t3 = _mm_aesenc_si128(t3, k11);
		t4 = _mm_aesenc_si128(t4, k11);
		t1 = _mm_aesenc_si128(t1, k12);
		t2 = _mm_aesenc_si128(t2, k12);
		t3 = _mm_aesenc_si128(t3, k12);
		t4 = _mm_aesenc_si128(t4, k12);
		t1 = _mm_aesenc_si128(t1, k13);
		t2 = _mm_aesenc_si128(t2, k13);
		t3 = _mm_aesenc_si128(t3, k13);
		t4 = _mm_aesenc_si128(t4, k13);

		t1 = _mm_aesenclast_si128(t1, k14);
		t2 = _mm_aesenclast_si128(t2, k14);
		t3 = _mm_aesenclast_si128(t3, k14);
		t4 = _mm_aesenclast_si128(t4, k14);
		/* XOR the keystream into the data */
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);
		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	/* leftover whole blocks, one at a time */
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(state, k0);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, k1);
		t1 = _mm_aesenc_si128(t1, k2);
		t1 = _mm_aesenc_si128(t1, k3);
		t1 = _mm_aesenc_si128(t1, k4);
		t1 = _mm_aesenc_si128(t1, k5);
		t1 = _mm_aesenc_si128(t1, k6);
		t1 = _mm_aesenc_si128(t1, k7);
		t1 = _mm_aesenc_si128(t1, k8);
		t1 = _mm_aesenc_si128(t1, k9);
		t1 = _mm_aesenc_si128(t1, k10);
		t1 = _mm_aesenc_si128(t1, k11);
		t1 = _mm_aesenc_si128(t1, k12);
		t1 = _mm_aesenc_si128(t1, k13);

		t1 = _mm_aesenclast_si128(t1, k14);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);
	}

	/* trailing partial block: zero-pad it through a stack buffer, then
	 * copy only the rem valid bytes back out */
	if (rem)
	{
		memset(&b, 0, sizeof(b));
		memcpy(&b, bi + blocks, rem);

		d1 = _mm_loadu_si128(&b);
		t1 = _mm_xor_si128(state, k0);

		t1 = _mm_aesenc_si128(t1, k1);
		t1 = _mm_aesenc_si128(t1, k2);
		t1 = _mm_aesenc_si128(t1, k3);
		t1 = _mm_aesenc_si128(t1, k4);
		t1 = _mm_aesenc_si128(t1, k5);
		t1 = _mm_aesenc_si128(t1, k6);
		t1 = _mm_aesenc_si128(t1, k7);
		t1 = _mm_aesenc_si128(t1, k8);
		t1 = _mm_aesenc_si128(t1, k9);
		t1 = _mm_aesenc_si128(t1, k10);
		t1 = _mm_aesenc_si128(t1, k11);
		t1 = _mm_aesenc_si128(t1, k12);
		t1 = _mm_aesenc_si128(t1, k13);

		t1 = _mm_aesenclast_si128(t1, k14);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(&b, t1);

		memcpy(bo + blocks, &b, rem);
	}
}
563 | ||
564 | METHOD(crypter_t, crypt, bool, | |
565 | private_aesni_ctr_t *this, chunk_t in, chunk_t iv, chunk_t *out) | |
566 | { | |
567 | u_char *buf; | |
568 | ||
569 | if (!this->key || iv.len != sizeof(this->state.iv)) | |
570 | { | |
571 | return FALSE; | |
572 | } | |
573 | memcpy(this->state.iv, iv.ptr, sizeof(this->state.iv)); | |
574 | this->state.counter = htonl(1); | |
575 | ||
576 | buf = in.ptr; | |
577 | if (out) | |
578 | { | |
579 | *out = chunk_alloc(in.len); | |
580 | buf = out->ptr; | |
581 | } | |
582 | this->crypt(this, in.len, in.ptr, buf); | |
583 | return TRUE; | |
584 | } | |
585 | ||
/**
 * Implements crypter_t.get_block_size: CTR acts as a stream cipher, so
 * any input length is valid.
 */
METHOD(crypter_t, get_block_size, size_t,
	private_aesni_ctr_t *this)
{
	return 1;
}
591 | ||
/**
 * Implements crypter_t.get_iv_size: the 8 byte explicit IV portion of
 * the counter block.
 */
METHOD(crypter_t, get_iv_size, size_t,
	private_aesni_ctr_t *this)
{
	return sizeof(this->state.iv);
}
597 | ||
/**
 * Implements crypter_t.get_key_size: keymat is the AES key followed by
 * the 4 byte nonce salt.
 */
METHOD(crypter_t, get_key_size, size_t,
	private_aesni_ctr_t *this)
{
	return this->key_size + sizeof(this->state.nonce);
}
603 | ||
604 | METHOD(crypter_t, set_key, bool, | |
605 | private_aesni_ctr_t *this, chunk_t key) | |
606 | { | |
607 | if (key.len != get_key_size(this)) | |
608 | { | |
609 | return FALSE; | |
610 | } | |
611 | ||
612 | memcpy(this->state.nonce, key.ptr + key.len - sizeof(this->state.nonce), | |
613 | sizeof(this->state.nonce)); | |
614 | key.len -= sizeof(this->state.nonce); | |
615 | ||
616 | DESTROY_IF(this->key); | |
617 | this->key = aesni_key_create(TRUE, key); | |
618 | ||
619 | return this->key; | |
620 | } | |
621 | ||
/**
 * Implements crypter_t.destroy: release the key schedule (if any) and
 * the object itself.
 */
METHOD(crypter_t, destroy, void,
	private_aesni_ctr_t *this)
{
	DESTROY_IF(this->key);
	free(this);
}
628 | ||
629 | /** | |
630 | * See header | |
631 | */ | |
632 | aesni_ctr_t *aesni_ctr_create(encryption_algorithm_t algo, size_t key_size) | |
633 | { | |
634 | private_aesni_ctr_t *this; | |
635 | ||
636 | if (algo != ENCR_AES_CTR) | |
637 | { | |
638 | return NULL; | |
639 | } | |
640 | switch (key_size) | |
641 | { | |
642 | case 0: | |
643 | key_size = 16; | |
644 | break; | |
645 | case 16: | |
646 | case 24: | |
647 | case 32: | |
648 | break; | |
649 | default: | |
650 | return NULL; | |
651 | } | |
652 | ||
653 | INIT(this, | |
654 | .public = { | |
655 | .crypter = { | |
656 | .encrypt = _crypt, | |
657 | .decrypt = _crypt, | |
658 | .get_block_size = _get_block_size, | |
659 | .get_iv_size = _get_iv_size, | |
660 | .get_key_size = _get_key_size, | |
661 | .set_key = _set_key, | |
662 | .destroy = _destroy, | |
663 | }, | |
664 | }, | |
665 | .key_size = key_size, | |
74d43cbd MW |
666 | ); |
667 | ||
9e47c1fe MW |
668 | switch (key_size) |
669 | { | |
670 | case 16: | |
671 | this->crypt = encrypt_ctr128; | |
672 | break; | |
673 | case 24: | |
674 | this->crypt = encrypt_ctr192; | |
675 | break; | |
676 | case 32: | |
677 | this->crypt = encrypt_ctr256; | |
678 | break; | |
679 | } | |
680 | ||
74d43cbd MW |
681 | return &this->public; |
682 | } |