From bcdbd313c0e6fd630a8945fd58dc5383631dc6dd Mon Sep 17 00:00:00 2001
From: Timothy McCaffrey <timothy.mccaffrey@unisys.com>
Date: Tue, 13 Jan 2015 13:16:43 -0500
Subject: [PATCH] crypto: aesni - Add support for 192 & 256 bit keys to AESNI
 RFC4106

This patch fixes the RFC4106 implementation in the aesni-intel
module so it supports 192 & 256 bit keys.

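The new SSE code no longer hard-codes ten rounds; each loop derives its
round count from the key length stored in the expanded-key context. As a
rough illustration (not part of the patch, and the helper name is
invented), the arithmetic behind the "shr $2 / add $5" sequences in the
hunks below looks like this in C:

	#include <crypto/aes.h>		/* struct crypto_aes_ctx */

	/*
	 * key_length is in bytes (16, 24 or 32), so key_length / 4 is Nk in
	 * AES terms and Nk + 6 is the total round count. The asm loops run
	 * Nk + 5 full AESENC rounds and finish with one AESENCLAST.
	 */
	static unsigned int aesni_full_rounds(const struct crypto_aes_ctx *ctx)
	{
		return (ctx->key_length >> 2) + 5;	/* 16->9, 24->11, 32->13 */
	}
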
Since the AVX support that was previously added to this module also
handles only 128 bit keys, and this patch changes only the SSE
implementation, the glue code was also changed to fall back to the SSE
version whenever a key size other than 128 bits is specified.

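The fallback is a key-length test in front of each AVX entry point;
condensed into a predicate (the helper below is ours for illustration,
while AES_KEYSIZE_128 and the AVX_GEN2_OPTSIZE threshold come straight
from the glue code):

	#include <linux/types.h>	/* bool */
	#include <crypto/aes.h>		/* AES_KEYSIZE_128, struct crypto_aes_ctx */

	/* Route short buffers and non-128-bit keys to the SSE implementation. */
	static bool use_sse_gcm(const struct crypto_aes_ctx *aes_ctx,
				unsigned long len, unsigned long avx_optsize)
	{
		return len < avx_optsize ||
		       aes_ctx->key_length != AES_KEYSIZE_128;
	}
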
RFC4106 specifies that 192 & 256 bit keys must be supported (section
8.4).

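Concretely, rfc4106_set_key now accepts all three sizes once the trailing
4-byte nonce (salt) is stripped. A condensed sketch of that check (the
standalone helper is ours; the constants and the logic mirror the
rfc4106_set_key hunk below):

	#include <linux/errno.h>	/* EINVAL */
	#include <crypto/aes.h>		/* AES_KEYSIZE_128/192/256 */

	/* An RFC 4106 key blob is the AES key followed by a 4-byte nonce. */
	static int rfc4106_check_key_len(unsigned int key_len)
	{
		key_len -= 4;		/* account for the nonce at the end */
		if (key_len != AES_KEYSIZE_128 &&
		    key_len != AES_KEYSIZE_192 &&
		    key_len != AES_KEYSIZE_256)
			return -EINVAL;
		return 0;
	}
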
Also, this should fix strongSwan issue 341, where the aesni module
needs to be unloaded if 256 bit keys are used:

http://wiki.strongswan.org/issues/341

This patch has been tested with Sandy Bridge and Haswell processors.
With 128 bit keys and input buffers > 512 bytes a slight performance
degradation was noticed (~1%). For input buffers of less than 512
bytes there was no performance impact. Compared to 128 bit keys,
256 bit key size performance is approx. 0.5 cycles per byte slower
on Sandy Bridge, and 0.37 cycles per byte slower on Haswell (vs.
SSE code).

This patch has also been tested with strongSwan IPsec connections,
where it worked correctly.

I created this diff from a git clone of crypto-2.6.git.

Any questions, please feel free to contact me.

Signed-off-by: Timothy McCaffrey <timothy.mccaffrey@unisys.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_asm.S  | 342 +++++++++++++++++++------------------
 arch/x86/crypto/aesni-intel_glue.c |  31 +++-
 2 files changed, 202 insertions(+), 171 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index c92c7d8..f5cdfbf 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -33,12 +33,23 @@
#include <asm/inst.h>
#include <asm/alternative-asm.h>

+/*
+ * The following macros are used to move an (un)aligned 16 byte value to/from
+ * an XMM register. This can done for either FP or integer values, for FP use
+ * movaps (move aligned packed single) or integer use movdqa (move double quad
+ * aligned). It doesn't make a performance difference which instruction is used
+ * since Nehalem (original Core i7) was released. However, the movaps is a byte
+ * shorter, so that is the one we'll use for now. (same for unaligned).
+ */
+#define MOVADQ movaps
+#define MOVUDQ movups
+
#ifdef __x86_64__
+
.data
.align 16
.Lgf128mul_x_ble_mask:
.octa 0x00000000000000010000000000000087
-
POLY: .octa 0xC2000000000000000000000000000001
TWOONE: .octa 0x00000001000000000000000000000001

@@ -90,6 +101,7 @@ enc: .octa 0x2
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
+#define keysize 2*15*16(%arg1)
#endif


@@ -214,10 +226,12 @@ enc: .octa 0x2

.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+ MOVADQ SHUF_MASK(%rip), %xmm14
mov arg7, %r10 # %r10 = AAD
mov arg8, %r15 # %r15 = aadLen
mov %r15, %r11
pxor %xmm\i, %xmm\i
+
_get_AAD_loop\num_initial_blocks\operation:
movd (%r10), \TMP1
pslldq $12, \TMP1
@@ -226,6 +240,7 @@ _get_AAD_loop\num_initial_blocks\operation:
add $4, %r10
sub $4, %r15
jne _get_AAD_loop\num_initial_blocks\operation
+
cmp $16, %r11
je _get_AAD_loop2_done\num_initial_blocks\operation
mov $16, %r15
@@ -234,8 +249,8 @@ _get_AAD_loop2\num_initial_blocks\operation:
sub $4, %r15
cmp %r11, %r15
jne _get_AAD_loop2\num_initial_blocks\operation
+
_get_AAD_loop2_done\num_initial_blocks\operation:
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data

xor %r11, %r11 # initialise the data pointer offset as zero
@@ -244,59 +259,34 @@ _get_AAD_loop2_done\num_initial_blocks\operation:

mov %arg5, %rax # %rax = *Y0
movdqu (%rax), \XMM0 # XMM0 = Y0
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
+ MOVADQ ONE(%RIP),\TMP1
+ MOVADQ (%arg1),\TMP2
.irpc index, \i_seq
- paddd ONE(%rip), \XMM0 # INCR Y0
+ paddd \TMP1, \XMM0 # INCR Y0
movdqa \XMM0, %xmm\index
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
-
-.endr
-.irpc index, \i_seq
- pxor 16*0(%arg1), %xmm\index
-.endr
-.irpc index, \i_seq
- movaps 0x10(%rdi), \TMP1
- AESENC \TMP1, %xmm\index # Round 1
-.endr
-.irpc index, \i_seq
- movaps 0x20(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
+ pxor \TMP2, %xmm\index
.endr
-.irpc index, \i_seq
- movaps 0x30(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x40(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x50(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x60(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x70(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x80(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x90(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
+ lea 0x10(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ add $5,%eax # 128->9, 192->11, 256->13
+
+aes_loop_initial_dec\num_initial_blocks:
+ MOVADQ (%r10),\TMP1
+.irpc index, \i_seq
+ AESENC \TMP1, %xmm\index
.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_initial_dec\num_initial_blocks
+
+ MOVADQ (%r10), \TMP1
.irpc index, \i_seq
- movaps 0xa0(%arg1), \TMP1
- AESENCLAST \TMP1, %xmm\index # Round 10
+ AESENCLAST \TMP1, %xmm\index # Last Round
.endr
.irpc index, \i_seq
movdqu (%arg3 , %r11, 1), \TMP1
@@ -306,10 +296,8 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
add $16, %r11

movdqa \TMP1, %xmm\index
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, %xmm\index
-
- # prepare plaintext/ciphertext for GHASH computation
+ # prepare plaintext/ciphertext for GHASH computation
.endr
.endif
GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
@@ -339,30 +327,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM1
- movdqa SHUF_MASK(%rip), %xmm14
+ MOVADQ ONE(%rip), \TMP1
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM1
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap

- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM2
- movdqa SHUF_MASK(%rip), %xmm14
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM2
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap

- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM3
- movdqa SHUF_MASK(%rip), %xmm14
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM3
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap

- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM4
- movdqa SHUF_MASK(%rip), %xmm14
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM4
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

- pxor 16*0(%arg1), \XMM1
- pxor 16*0(%arg1), \XMM2
- pxor 16*0(%arg1), \XMM3
- pxor 16*0(%arg1), \XMM4
+ MOVADQ 0(%arg1),\TMP1
+ pxor \TMP1, \XMM1
+ pxor \TMP1, \XMM2
+ pxor \TMP1, \XMM3
+ pxor \TMP1, \XMM4
movdqa \TMP3, \TMP5
pshufd $78, \TMP3, \TMP1
pxor \TMP3, \TMP1
@@ -400,7 +386,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
pshufd $78, \TMP5, \TMP1
pxor \TMP5, \TMP1
movdqa \TMP1, HashKey_4_k(%rsp)
- movaps 0xa0(%arg1), \TMP2
+ lea 0xa0(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ sub $4,%eax # 128->0, 192->2, 256->4
+ jz aes_loop_pre_dec_done\num_initial_blocks
+
+aes_loop_pre_dec\num_initial_blocks:
+ MOVADQ (%r10),\TMP2
+.irpc index, 1234
+ AESENC \TMP2, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_pre_dec\num_initial_blocks
+
+aes_loop_pre_dec_done\num_initial_blocks:
+ MOVADQ (%r10), \TMP2
AESENCLAST \TMP2, \XMM1
AESENCLAST \TMP2, \XMM2
AESENCLAST \TMP2, \XMM3
@@ -422,15 +424,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
movdqa \TMP1, \XMM4
add $64, %r11
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
pxor \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:
@@ -452,6 +450,7 @@ _initial_blocks_done\num_initial_blocks\operation:

.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+ MOVADQ SHUF_MASK(%rip), %xmm14
mov arg7, %r10 # %r10 = AAD
mov arg8, %r15 # %r15 = aadLen
mov %r15, %r11
@@ -473,7 +472,6 @@ _get_AAD_loop2\num_initial_blocks\operation:
cmp %r11, %r15
jne _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data

xor %r11, %r11 # initialise the data pointer offset as zero
@@ -482,59 +480,35 @@ _get_AAD_loop2_done\num_initial_blocks\operation:

mov %arg5, %rax # %rax = *Y0
movdqu (%rax), \XMM0 # XMM0 = Y0
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
-.irpc index, \i_seq
- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, %xmm\index
- movdqa SHUF_MASK(%rip), %xmm14
- PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap

-.endr
-.irpc index, \i_seq
- pxor 16*0(%arg1), %xmm\index
-.endr
-.irpc index, \i_seq
- movaps 0x10(%rdi), \TMP1
- AESENC \TMP1, %xmm\index # Round 1
-.endr
-.irpc index, \i_seq
- movaps 0x20(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
+ MOVADQ ONE(%RIP),\TMP1
+ MOVADQ 0(%arg1),\TMP2
.irpc index, \i_seq
- movaps 0x30(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, %xmm\index
+ PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
+ pxor \TMP2, %xmm\index
.endr
-.irpc index, \i_seq
- movaps 0x40(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x50(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x60(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x70(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x80(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x90(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
+ lea 0x10(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ add $5,%eax # 128->9, 192->11, 256->13
+
+aes_loop_initial_enc\num_initial_blocks:
+ MOVADQ (%r10),\TMP1
+.irpc index, \i_seq
+ AESENC \TMP1, %xmm\index
.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_initial_enc\num_initial_blocks
+
+ MOVADQ (%r10), \TMP1
.irpc index, \i_seq
- movaps 0xa0(%arg1), \TMP1
- AESENCLAST \TMP1, %xmm\index # Round 10
+ AESENCLAST \TMP1, %xmm\index # Last Round
.endr
.irpc index, \i_seq
movdqu (%arg3 , %r11, 1), \TMP1
@@ -542,8 +516,6 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
movdqu %xmm\index, (%arg2 , %r11, 1)
# write back plaintext/ciphertext for num_initial_blocks
add $16, %r11
-
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, %xmm\index

# prepare plaintext/ciphertext for GHASH computation
@@ -576,30 +548,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM1
- movdqa SHUF_MASK(%rip), %xmm14
+ MOVADQ ONE(%RIP),\TMP1
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM1
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap

- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM2
- movdqa SHUF_MASK(%rip), %xmm14
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM2
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap

- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM3
- movdqa SHUF_MASK(%rip), %xmm14
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM3
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap

- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM4
- movdqa SHUF_MASK(%rip), %xmm14
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM4
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

- pxor 16*0(%arg1), \XMM1
- pxor 16*0(%arg1), \XMM2
- pxor 16*0(%arg1), \XMM3
- pxor 16*0(%arg1), \XMM4
+ MOVADQ 0(%arg1),\TMP1
+ pxor \TMP1, \XMM1
+ pxor \TMP1, \XMM2
+ pxor \TMP1, \XMM3
+ pxor \TMP1, \XMM4
movdqa \TMP3, \TMP5
pshufd $78, \TMP3, \TMP1
pxor \TMP3, \TMP1
@@ -637,7 +607,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
pshufd $78, \TMP5, \TMP1
pxor \TMP5, \TMP1
movdqa \TMP1, HashKey_4_k(%rsp)
- movaps 0xa0(%arg1), \TMP2
+ lea 0xa0(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ sub $4,%eax # 128->0, 192->2, 256->4
+ jz aes_loop_pre_enc_done\num_initial_blocks
+
+aes_loop_pre_enc\num_initial_blocks:
+ MOVADQ (%r10),\TMP2
+.irpc index, 1234
+ AESENC \TMP2, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_pre_enc\num_initial_blocks
+
+aes_loop_pre_enc_done\num_initial_blocks:
+ MOVADQ (%r10), \TMP2
AESENCLAST \TMP2, \XMM1
AESENCLAST \TMP2, \XMM2
AESENCLAST \TMP2, \XMM3
@@ -656,15 +642,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
movdqu \XMM4, 16*3(%arg2 , %r11 , 1)

add $64, %r11
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
pxor \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:
@@ -795,7 +777,23 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
AESENC \TMP3, \XMM3
AESENC \TMP3, \XMM4
PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
- movaps 0xa0(%arg1), \TMP3
+ lea 0xa0(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ sub $4,%eax # 128->0, 192->2, 256->4
+ jz aes_loop_par_enc_done
+
+aes_loop_par_enc:
+ MOVADQ (%r10),\TMP3
+.irpc index, 1234
+ AESENC \TMP3, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_par_enc
+
+aes_loop_par_enc_done:
+ MOVADQ (%r10), \TMP3
AESENCLAST \TMP3, \XMM1 # Round 10
AESENCLAST \TMP3, \XMM2
AESENCLAST \TMP3, \XMM3
@@ -987,8 +985,24 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
AESENC \TMP3, \XMM3
AESENC \TMP3, \XMM4
PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
- movaps 0xa0(%arg1), \TMP3
- AESENCLAST \TMP3, \XMM1 # Round 10
+ lea 0xa0(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ sub $4,%eax # 128->0, 192->2, 256->4
+ jz aes_loop_par_dec_done
+
+aes_loop_par_dec:
+ MOVADQ (%r10),\TMP3
+.irpc index, 1234
+ AESENC \TMP3, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_par_dec
+
+aes_loop_par_dec_done:
+ MOVADQ (%r10), \TMP3
+ AESENCLAST \TMP3, \XMM1 # last round
AESENCLAST \TMP3, \XMM2
AESENCLAST \TMP3, \XMM3
AESENCLAST \TMP3, \XMM4
@@ -1156,33 +1170,29 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
pxor \TMP6, \XMMDst # reduced result is in XMMDst
.endm

-/* Encryption of a single block done*/
-.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

- pxor (%arg1), \XMM0
- movaps 16(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 32(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 48(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 64(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 80(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 96(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 112(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 128(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 144(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 160(%arg1), \TMP1
- AESENCLAST \TMP1, \XMM0
-.endm
+/* Encryption of a single block
+* uses eax & r10
+*/

+.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

+ pxor (%arg1), \XMM0
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ add $5,%eax # 128->9, 192->11, 256->13
+ lea 16(%arg1), %r10 # get first expanded key address
+
+_esb_loop_\@:
+ MOVADQ (%r10),\TMP1
+ AESENC \TMP1,\XMM0
+ add $16,%r10
+ sub $1,%eax
+ jnz _esb_loop_\@
+
+ MOVADQ (%r10),\TMP1
+ AESENCLAST \TMP1,\XMM0
+.endm
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
* u8 *out, // Plaintext output. Encrypt in-place is allowed.
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 6d4faba..bfaf817 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -177,7 +177,8 @@ static void aesni_gcm_enc_avx(void *ctx, u8 *out,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len)
{
- if (plaintext_len < AVX_GEN2_OPTSIZE) {
+ struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+ if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){
aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
aad_len, auth_tag, auth_tag_len);
} else {
@@ -192,7 +193,8 @@ static void aesni_gcm_dec_avx(void *ctx, u8 *out,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len)
{
- if (ciphertext_len < AVX_GEN2_OPTSIZE) {
+ struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+ if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad,
aad_len, auth_tag, auth_tag_len);
} else {
@@ -226,7 +228,8 @@ static void aesni_gcm_enc_avx2(void *ctx, u8 *out,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len)
{
- if (plaintext_len < AVX_GEN2_OPTSIZE) {
+ struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+ if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
aad_len, auth_tag, auth_tag_len);
} else if (plaintext_len < AVX_GEN4_OPTSIZE) {
@@ -245,7 +248,8 @@ static void aesni_gcm_dec_avx2(void *ctx, u8 *out,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len)
{
- if (ciphertext_len < AVX_GEN2_OPTSIZE) {
+ struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+ if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey,
aad, aad_len, auth_tag, auth_tag_len);
} else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
@@ -878,7 +882,8 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
}
/*Account for 4 byte nonce at the end.*/
key_len -= 4;
- if (key_len != AES_KEYSIZE_128) {
+ if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
+ key_len != AES_KEYSIZE_256) {
crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
@@ -989,6 +994,7 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
__be32 counter = cpu_to_be32(1);
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ u32 key_len = ctx->aes_key_expanded.key_length;
void *aes_ctx = &(ctx->aes_key_expanded);
unsigned long auth_tag_len = crypto_aead_authsize(tfm);
u8 iv_tab[16+AESNI_ALIGN];
@@ -1003,6 +1009,13 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
/* to 8 or 12 bytes */
if (unlikely(req->assoclen != 8 && req->assoclen != 12))
return -EINVAL;
+ if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16))
+ return -EINVAL;
+ if (unlikely(key_len != AES_KEYSIZE_128 &&
+ key_len != AES_KEYSIZE_192 &&
+ key_len != AES_KEYSIZE_256))
+ return -EINVAL;
+
/* IV below built */
for (i = 0; i < 4; i++)
*(iv+i) = ctx->nonce[i];
@@ -1067,6 +1080,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
int retval = 0;
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ u32 key_len = ctx->aes_key_expanded.key_length;
void *aes_ctx = &(ctx->aes_key_expanded);
unsigned long auth_tag_len = crypto_aead_authsize(tfm);
u8 iv_and_authTag[32+AESNI_ALIGN];
@@ -1080,6 +1094,13 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
if (unlikely((req->cryptlen < auth_tag_len) ||
(req->assoclen != 8 && req->assoclen != 12)))
return -EINVAL;
+ if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16))
+ return -EINVAL;
+ if (unlikely(key_len != AES_KEYSIZE_128 &&
+ key_len != AES_KEYSIZE_192 &&
+ key_len != AES_KEYSIZE_256))
+ return -EINVAL;
+
/* Assuming we are supporting rfc4106 64-bit extended */
/* sequence numbers We need to have the AAD length */
/* equal to 8 or 12 bytes */
--
2.7.4