From bcdbd313c0e6fd630a8945fd58dc5383631dc6dd Mon Sep 17 00:00:00 2001
From: Timothy McCaffrey <timothy.mccaffrey@unisys.com>
Date: Tue, 13 Jan 2015 13:16:43 -0500
Subject: [PATCH] crypto: aesni - Add support for 192 & 256 bit keys to AESNI
 RFC4106

These patches fix the RFC4106 implementation in the aesni-intel
module so it supports 192 & 256 bit keys.

Since the AVX support that was added to this module also only
supports 128 bit keys, and this patch only affects the SSE
implementation, changes were also made to use the SSE version
if key sizes other than 128 are specified.

RFC4106 specifies that 192 & 256 bit keys must be supported (section
8.4).

Also, this should fix Strongswan issue 341 where the aesni module
needs to be unloaded if 256 bit keys are used:

http://wiki.strongswan.org/issues/341

This patch has been tested with Sandy Bridge and Haswell processors.
With 128 bit keys and input buffers > 512 bytes a slight performance
degradation was noticed (~1%). For input buffers of less than 512
bytes there was no performance impact. Compared to 128 bit keys,
256 bit key size performance is approx. .5 cycles per byte slower
on Sandy Bridge, and .37 cycles per byte slower on Haswell (vs.
SSE code).

This patch has also been tested with StrongSwan IPSec connections
where it worked correctly.

I created this diff from a git clone of crypto-2.6.git.

Any questions, please feel free to contact me.

Signed-off-by: Timothy McCaffrey <timothy.mccaffrey@unisys.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
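Note (this text sits in the free-form area after "---" and is not part of the
commit message): the new key-length handling below derives the number of AES
rounds from the key schedule's key_length field with "shr $2; add $5", so
16-, 24- and 32-byte keys run 9, 11 and 13 full AESENC rounds before the
final AESENCLAST (10/12/14 rounds in total). A minimal C sketch of that
arithmetic; the helper name is illustrative only and not taken from the patch:

#include <stdio.h>

/*
 * Sketch of the round-count arithmetic used by the new aes_loop_* code in
 * aesni-intel_asm.S.  key_length is the AES key size in bytes (16, 24 or 32),
 * read from the field the patch names "keysize" in the expanded key schedule.
 * The result is the number of full AESENC rounds; one AESENCLAST follows.
 */
static int full_aesenc_rounds(unsigned int key_length)
{
	return (key_length >> 2) + 5;	/* 16 -> 9, 24 -> 11, 32 -> 13 */
}

int main(void)
{
	unsigned int sizes[] = { 16, 24, 32 };
	int i;

	for (i = 0; i < 3; i++)
		printf("AES-%u: %d AESENC rounds + 1 AESENCLAST\n",
		       sizes[i] * 8, full_aesenc_rounds(sizes[i]));
	return 0;
}

The glue code applies the same key-length distinction: because the AVX/AVX2
GCM routines only handle 128-bit keys, any other key length is routed to the
SSE implementation (the aes_ctx->key_length != AES_KEYSIZE_128 checks below).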
 arch/x86/crypto/aesni-intel_asm.S | 342 +++++++++++++++++++------------------
 arch/x86/crypto/aesni-intel_glue.c | 31 +++-
 2 files changed, 202 insertions(+), 171 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index c92c7d8..f5cdfbf 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -33,12 +33,23 @@
 #include <asm/inst.h>
 #include <asm/alternative-asm.h>

+/*
+ * The following macros are used to move an (un)aligned 16 byte value to/from
+ * an XMM register. This can done for either FP or integer values, for FP use
+ * movaps (move aligned packed single) or integer use movdqa (move double quad
+ * aligned). It doesn't make a performance difference which instruction is used
+ * since Nehalem (original Core i7) was released. However, the movaps is a byte
+ * shorter, so that is the one we'll use for now. (same for unaligned).
+ */
+#define MOVADQ movaps
+#define MOVUDQ movups
+
 #ifdef __x86_64__
+
 .data
 .align 16
 .Lgf128mul_x_ble_mask:
 .octa 0x00000000000000010000000000000087
-
 POLY: .octa 0xC2000000000000000000000000000001
 TWOONE: .octa 0x00000001000000000000000000000001

@@ -90,6 +101,7 @@ enc: .octa 0x2
 #define arg8 STACK_OFFSET+16(%r14)
 #define arg9 STACK_OFFSET+24(%r14)
 #define arg10 STACK_OFFSET+32(%r14)
+#define keysize 2*15*16(%arg1)
 #endif


@@ -214,10 +226,12 @@ enc: .octa 0x2

 .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+ MOVADQ SHUF_MASK(%rip), %xmm14
 mov arg7, %r10 # %r10 = AAD
 mov arg8, %r15 # %r15 = aadLen
 mov %r15, %r11
 pxor %xmm\i, %xmm\i
+
 _get_AAD_loop\num_initial_blocks\operation:
 movd (%r10), \TMP1
 pslldq $12, \TMP1
@@ -226,6 +240,7 @@ _get_AAD_loop\num_initial_blocks\operation:
 add $4, %r10
 sub $4, %r15
 jne _get_AAD_loop\num_initial_blocks\operation
+
 cmp $16, %r11
 je _get_AAD_loop2_done\num_initial_blocks\operation
 mov $16, %r15
@@ -234,8 +249,8 @@ _get_AAD_loop2\num_initial_blocks\operation:
 sub $4, %r15
 cmp %r11, %r15
 jne _get_AAD_loop2\num_initial_blocks\operation
+
 _get_AAD_loop2_done\num_initial_blocks\operation:
- movdqa SHUF_MASK(%rip), %xmm14
 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data

 xor %r11, %r11 # initialise the data pointer offset as zero
@@ -244,59 +259,34 @@ _get_AAD_loop2_done\num_initial_blocks\operation:

 mov %arg5, %rax # %rax = *Y0
 movdqu (%rax), \XMM0 # XMM0 = Y0
- movdqa SHUF_MASK(%rip), %xmm14
 PSHUFB_XMM %xmm14, \XMM0

 .if (\i == 5) || (\i == 6) || (\i == 7)
+ MOVADQ ONE(%RIP),\TMP1
+ MOVADQ (%arg1),\TMP2
 .irpc index, \i_seq
- paddd ONE(%rip), \XMM0 # INCR Y0
+ paddd \TMP1, \XMM0 # INCR Y0
 movdqa \XMM0, %xmm\index
- movdqa SHUF_MASK(%rip), %xmm14
 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
-
-.endr
-.irpc index, \i_seq
- pxor 16*0(%arg1), %xmm\index
-.endr
-.irpc index, \i_seq
- movaps 0x10(%rdi), \TMP1
- AESENC \TMP1, %xmm\index # Round 1
-.endr
-.irpc index, \i_seq
- movaps 0x20(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
+ pxor \TMP2, %xmm\index
 .endr
-.irpc index, \i_seq
- movaps 0x30(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x40(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x50(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x60(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x70(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x80(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x90(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
+ lea 0x10(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ add $5,%eax # 128->9, 192->11, 256->13
+
+aes_loop_initial_dec\num_initial_blocks:
+ MOVADQ (%r10),\TMP1
+.irpc index, \i_seq
+ AESENC \TMP1, %xmm\index
 .endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_initial_dec\num_initial_blocks
+
+ MOVADQ (%r10), \TMP1
 .irpc index, \i_seq
- movaps 0xa0(%arg1), \TMP1
- AESENCLAST \TMP1, %xmm\index # Round 10
+ AESENCLAST \TMP1, %xmm\index # Last Round
 .endr
 .irpc index, \i_seq
 movdqu (%arg3 , %r11, 1), \TMP1
@@ -306,10 +296,8 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 add $16, %r11

 movdqa \TMP1, %xmm\index
- movdqa SHUF_MASK(%rip), %xmm14
 PSHUFB_XMM %xmm14, %xmm\index
-
- # prepare plaintext/ciphertext for GHASH computation
+ # prepare plaintext/ciphertext for GHASH computation
 .endr
 .endif
 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
@@ -339,30 +327,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
 */
- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM1
- movdqa SHUF_MASK(%rip), %xmm14
+ MOVADQ ONE(%rip), \TMP1
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM1
 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap

- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM2
- movdqa SHUF_MASK(%rip), %xmm14
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM2
 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap

- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM3
- movdqa SHUF_MASK(%rip), %xmm14
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM3
 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap

- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM4
- movdqa SHUF_MASK(%rip), %xmm14
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM4
 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

- pxor 16*0(%arg1), \XMM1
- pxor 16*0(%arg1), \XMM2
- pxor 16*0(%arg1), \XMM3
- pxor 16*0(%arg1), \XMM4
+ MOVADQ 0(%arg1),\TMP1
+ pxor \TMP1, \XMM1
+ pxor \TMP1, \XMM2
+ pxor \TMP1, \XMM3
+ pxor \TMP1, \XMM4
 movdqa \TMP3, \TMP5
 pshufd $78, \TMP3, \TMP1
 pxor \TMP3, \TMP1
@@ -400,7 +386,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 pshufd $78, \TMP5, \TMP1
 pxor \TMP5, \TMP1
 movdqa \TMP1, HashKey_4_k(%rsp)
- movaps 0xa0(%arg1), \TMP2
+ lea 0xa0(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ sub $4,%eax # 128->0, 192->2, 256->4
+ jz aes_loop_pre_dec_done\num_initial_blocks
+
+aes_loop_pre_dec\num_initial_blocks:
+ MOVADQ (%r10),\TMP2
+.irpc index, 1234
+ AESENC \TMP2, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_pre_dec\num_initial_blocks
+
+aes_loop_pre_dec_done\num_initial_blocks:
+ MOVADQ (%r10), \TMP2
 AESENCLAST \TMP2, \XMM1
 AESENCLAST \TMP2, \XMM2
 AESENCLAST \TMP2, \XMM3
@@ -422,15 +424,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
 movdqa \TMP1, \XMM4
 add $64, %r11
- movdqa SHUF_MASK(%rip), %xmm14
 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
 pxor \XMMDst, \XMM1
 # combine GHASHed value with the corresponding ciphertext
- movdqa SHUF_MASK(%rip), %xmm14
 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
- movdqa SHUF_MASK(%rip), %xmm14
 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
- movdqa SHUF_MASK(%rip), %xmm14
 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

 _initial_blocks_done\num_initial_blocks\operation:
@@ -452,6 +450,7 @@ _initial_blocks_done\num_initial_blocks\operation:

 .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+ MOVADQ SHUF_MASK(%rip), %xmm14
 mov arg7, %r10 # %r10 = AAD
 mov arg8, %r15 # %r15 = aadLen
 mov %r15, %r11
@@ -473,7 +472,6 @@ _get_AAD_loop2\num_initial_blocks\operation:
 cmp %r11, %r15
 jne _get_AAD_loop2\num_initial_blocks\operation
 _get_AAD_loop2_done\num_initial_blocks\operation:
- movdqa SHUF_MASK(%rip), %xmm14
 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data

 xor %r11, %r11 # initialise the data pointer offset as zero
@@ -482,59 +480,35 @@ _get_AAD_loop2_done\num_initial_blocks\operation:

 mov %arg5, %rax # %rax = *Y0
 movdqu (%rax), \XMM0 # XMM0 = Y0
- movdqa SHUF_MASK(%rip), %xmm14
 PSHUFB_XMM %xmm14, \XMM0

 .if (\i == 5) || (\i == 6) || (\i == 7)
-.irpc index, \i_seq
- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, %xmm\index
- movdqa SHUF_MASK(%rip), %xmm14
- PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap

-.endr
-.irpc index, \i_seq
- pxor 16*0(%arg1), %xmm\index
-.endr
-.irpc index, \i_seq
- movaps 0x10(%rdi), \TMP1
- AESENC \TMP1, %xmm\index # Round 1
-.endr
-.irpc index, \i_seq
- movaps 0x20(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
+ MOVADQ ONE(%RIP),\TMP1
+ MOVADQ 0(%arg1),\TMP2
 .irpc index, \i_seq
- movaps 0x30(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, %xmm\index
+ PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
+ pxor \TMP2, %xmm\index
 .endr
-.irpc index, \i_seq
- movaps 0x40(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x50(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x60(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x70(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x80(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x90(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
+ lea 0x10(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ add $5,%eax # 128->9, 192->11, 256->13
+
+aes_loop_initial_enc\num_initial_blocks:
+ MOVADQ (%r10),\TMP1
+.irpc index, \i_seq
+ AESENC \TMP1, %xmm\index
 .endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_initial_enc\num_initial_blocks
+
+ MOVADQ (%r10), \TMP1
 .irpc index, \i_seq
- movaps 0xa0(%arg1), \TMP1
- AESENCLAST \TMP1, %xmm\index # Round 10
+ AESENCLAST \TMP1, %xmm\index # Last Round
 .endr
 .irpc index, \i_seq
 movdqu (%arg3 , %r11, 1), \TMP1
@@ -542,8 +516,6 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 movdqu %xmm\index, (%arg2 , %r11, 1)
 # write back plaintext/ciphertext for num_initial_blocks
 add $16, %r11
-
- movdqa SHUF_MASK(%rip), %xmm14
 PSHUFB_XMM %xmm14, %xmm\index

 # prepare plaintext/ciphertext for GHASH computation
@@ -576,30 +548,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
 */
- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM1
- movdqa SHUF_MASK(%rip), %xmm14
+ MOVADQ ONE(%RIP),\TMP1
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM1
 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap

- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM2
- movdqa SHUF_MASK(%rip), %xmm14
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM2
 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap

- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM3
- movdqa SHUF_MASK(%rip), %xmm14
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM3
 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap

- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM4
- movdqa SHUF_MASK(%rip), %xmm14
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM4
 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

- pxor 16*0(%arg1), \XMM1
- pxor 16*0(%arg1), \XMM2
- pxor 16*0(%arg1), \XMM3
- pxor 16*0(%arg1), \XMM4
+ MOVADQ 0(%arg1),\TMP1
+ pxor \TMP1, \XMM1
+ pxor \TMP1, \XMM2
+ pxor \TMP1, \XMM3
+ pxor \TMP1, \XMM4
 movdqa \TMP3, \TMP5
 pshufd $78, \TMP3, \TMP1
 pxor \TMP3, \TMP1
@@ -637,7 +607,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 pshufd $78, \TMP5, \TMP1
 pxor \TMP5, \TMP1
 movdqa \TMP1, HashKey_4_k(%rsp)
- movaps 0xa0(%arg1), \TMP2
+ lea 0xa0(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ sub $4,%eax # 128->0, 192->2, 256->4
+ jz aes_loop_pre_enc_done\num_initial_blocks
+
+aes_loop_pre_enc\num_initial_blocks:
+ MOVADQ (%r10),\TMP2
+.irpc index, 1234
+ AESENC \TMP2, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_pre_enc\num_initial_blocks
+
+aes_loop_pre_enc_done\num_initial_blocks:
+ MOVADQ (%r10), \TMP2
 AESENCLAST \TMP2, \XMM1
 AESENCLAST \TMP2, \XMM2
 AESENCLAST \TMP2, \XMM3
@@ -656,15 +642,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)

 add $64, %r11
- movdqa SHUF_MASK(%rip), %xmm14
 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
 pxor \XMMDst, \XMM1
 # combine GHASHed value with the corresponding ciphertext
- movdqa SHUF_MASK(%rip), %xmm14
 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
- movdqa SHUF_MASK(%rip), %xmm14
 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
- movdqa SHUF_MASK(%rip), %xmm14
 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

 _initial_blocks_done\num_initial_blocks\operation:
@@ -795,7 +777,23 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 AESENC \TMP3, \XMM3
 AESENC \TMP3, \XMM4
 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
- movaps 0xa0(%arg1), \TMP3
+ lea 0xa0(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ sub $4,%eax # 128->0, 192->2, 256->4
+ jz aes_loop_par_enc_done
+
+aes_loop_par_enc:
+ MOVADQ (%r10),\TMP3
+.irpc index, 1234
+ AESENC \TMP3, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_par_enc
+
+aes_loop_par_enc_done:
+ MOVADQ (%r10), \TMP3
 AESENCLAST \TMP3, \XMM1 # Round 10
 AESENCLAST \TMP3, \XMM2
 AESENCLAST \TMP3, \XMM3
@@ -987,8 +985,24 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 AESENC \TMP3, \XMM3
 AESENC \TMP3, \XMM4
 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
- movaps 0xa0(%arg1), \TMP3
- AESENCLAST \TMP3, \XMM1 # Round 10
+ lea 0xa0(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ sub $4,%eax # 128->0, 192->2, 256->4
+ jz aes_loop_par_dec_done
+
+aes_loop_par_dec:
+ MOVADQ (%r10),\TMP3
+.irpc index, 1234
+ AESENC \TMP3, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_par_dec
+
+aes_loop_par_dec_done:
+ MOVADQ (%r10), \TMP3
+ AESENCLAST \TMP3, \XMM1 # last round
 AESENCLAST \TMP3, \XMM2
 AESENCLAST \TMP3, \XMM3
 AESENCLAST \TMP3, \XMM4
@@ -1156,33 +1170,29 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
 pxor \TMP6, \XMMDst # reduced result is in XMMDst
 .endm

-/* Encryption of a single block done*/
-.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

- pxor (%arg1), \XMM0
- movaps 16(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 32(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 48(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 64(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 80(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 96(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 112(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 128(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 144(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 160(%arg1), \TMP1
- AESENCLAST \TMP1, \XMM0
-.endm
+/* Encryption of a single block
+* uses eax & r10
+*/

+.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

+ pxor (%arg1), \XMM0
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ add $5,%eax # 128->9, 192->11, 256->13
+ lea 16(%arg1), %r10 # get first expanded key address
+
+_esb_loop_\@:
+ MOVADQ (%r10),\TMP1
+ AESENC \TMP1,\XMM0
+ add $16,%r10
+ sub $1,%eax
+ jnz _esb_loop_\@
+
+ MOVADQ (%r10),\TMP1
+ AESENCLAST \TMP1,\XMM0
+.endm
 /*****************************************************************************
 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
 * u8 *out, // Plaintext output. Encrypt in-place is allowed.
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 6d4faba..bfaf817 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -177,7 +177,8 @@ static void aesni_gcm_enc_avx(void *ctx, u8 *out,
 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
 u8 *auth_tag, unsigned long auth_tag_len)
 {
- if (plaintext_len < AVX_GEN2_OPTSIZE) {
+ struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+ if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){
 aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
 aad_len, auth_tag, auth_tag_len);
 } else {
@@ -192,7 +193,8 @@ static void aesni_gcm_dec_avx(void *ctx, u8 *out,
 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
 u8 *auth_tag, unsigned long auth_tag_len)
 {
- if (ciphertext_len < AVX_GEN2_OPTSIZE) {
+ struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+ if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
 aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad,
 aad_len, auth_tag, auth_tag_len);
 } else {
@@ -226,7 +228,8 @@ static void aesni_gcm_enc_avx2(void *ctx, u8 *out,
 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
 u8 *auth_tag, unsigned long auth_tag_len)
 {
- if (plaintext_len < AVX_GEN2_OPTSIZE) {
+ struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+ if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
 aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
 aad_len, auth_tag, auth_tag_len);
 } else if (plaintext_len < AVX_GEN4_OPTSIZE) {
@@ -245,7 +248,8 @@ static void aesni_gcm_dec_avx2(void *ctx, u8 *out,
 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
 u8 *auth_tag, unsigned long auth_tag_len)
 {
- if (ciphertext_len < AVX_GEN2_OPTSIZE) {
+ struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+ if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
 aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey,
 aad, aad_len, auth_tag, auth_tag_len);
 } else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
@@ -878,7 +882,8 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
 }
 /*Account for 4 byte nonce at the end.*/
 key_len -= 4;
- if (key_len != AES_KEYSIZE_128) {
+ if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
+ key_len != AES_KEYSIZE_256) {
 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
 return -EINVAL;
 }
@@ -989,6 +994,7 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
 __be32 counter = cpu_to_be32(1);
 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ u32 key_len = ctx->aes_key_expanded.key_length;
 void *aes_ctx = &(ctx->aes_key_expanded);
 unsigned long auth_tag_len = crypto_aead_authsize(tfm);
 u8 iv_tab[16+AESNI_ALIGN];
@@ -1003,6 +1009,13 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
 /* to 8 or 12 bytes */
 if (unlikely(req->assoclen != 8 && req->assoclen != 12))
 return -EINVAL;
+ if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16))
+ return -EINVAL;
+ if (unlikely(key_len != AES_KEYSIZE_128 &&
+ key_len != AES_KEYSIZE_192 &&
+ key_len != AES_KEYSIZE_256))
+ return -EINVAL;
+
 /* IV below built */
 for (i = 0; i < 4; i++)
 *(iv+i) = ctx->nonce[i];
@@ -1067,6 +1080,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
 int retval = 0;
 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ u32 key_len = ctx->aes_key_expanded.key_length;
 void *aes_ctx = &(ctx->aes_key_expanded);
 unsigned long auth_tag_len = crypto_aead_authsize(tfm);
 u8 iv_and_authTag[32+AESNI_ALIGN];
@@ -1080,6 +1094,13 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
 if (unlikely((req->cryptlen < auth_tag_len) ||
 (req->assoclen != 8 && req->assoclen != 12)))
 return -EINVAL;
+ if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16))
+ return -EINVAL;
+ if (unlikely(key_len != AES_KEYSIZE_128 &&
+ key_len != AES_KEYSIZE_192 &&
+ key_len != AES_KEYSIZE_256))
+ return -EINVAL;
+
 /* Assuming we are supporting rfc4106 64-bit extended */
 /* sequence numbers We need to have the AAD length */
 /* equal to 8 or 12 bytes */
--
2.7.4