1 From bcdbd313c0e6fd630a8945fd58dc5383631dc6dd Mon Sep 17 00:00:00 2001
2 From: Timothy McCaffrey <timothy.mccaffrey@unisys.com>
3 Date: Tue, 13 Jan 2015 13:16:43 -0500
4 Subject: [PATCH] crypto: aesni - Add support for 192 & 256 bit keys to AESNI
7 These patches fix the RFC4106 implementation in the aesni-intel
8 module so it supports 192 & 256 bit keys.
10 Since the AVX support that was added to this module also only
11 supports 128 bit keys, and this patch only affects the SSE
12 implementation, changes were also made to use the SSE version
13 if key sizes other than 128 are specified.
15 RFC4106 specifies that 192 & 256 bit keys must be supported (section
16 8.4).
18 Also, this should fix Strongswan issue 341 where the aesni module
19 needs to be unloaded if 256 bit keys are used:
21 http://wiki.strongswan.org/issues/341
23 This patch has been tested with Sandy Bridge and Haswell processors.
24 With 128 bit keys and input buffers > 512 bytes a slight performance
25 degradation was noticed (~1%). For input buffers of less than 512
26 bytes there was no performance impact. Compared to 128 bit keys,
27 256 bit key size performance is approx. .5 cycles per byte slower
28 on Sandy Bridge, and .37 cycles per byte slower on Haswell (vs.
29 128 bit keys).
31 This patch has also been tested with StrongSwan IPSec connections
32 where it worked correctly.
34 I created this diff from a git clone of crypto-2.6.git.
36 Any questions, please feel free to contact me.
38 Signed-off-by: Timothy McCaffrey <timothy.mccaffrey@unisys.com>
39 Signed-off-by: Jarod Wilson <jarod@redhat.com>
40 Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
42 arch/x86/crypto/aesni-intel_asm.S | 342 +++++++++++++++++++------------------
43 arch/x86/crypto/aesni-intel_glue.c | 31 +++-
44 2 files changed, 202 insertions(+), 171 deletions(-)
46 diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
47 index c92c7d8..f5cdfbf 100644
48 --- a/arch/x86/crypto/aesni-intel_asm.S
49 +++ b/arch/x86/crypto/aesni-intel_asm.S
52 #include <asm/alternative-asm.h>
55 + * The following macros are used to move an (un)aligned 16 byte value to/from
56 + * an XMM register. This can done for either FP or integer values, for FP use
57 + * movaps (move aligned packed single) or integer use movdqa (move double quad
58 + * aligned). It doesn't make a performance difference which instruction is used
59 + * since Nehalem (original Core i7) was released. However, the movaps is a byte
60 + * shorter, so that is the one we'll use for now. (same for unaligned).
62 +#define MOVADQ movaps
63 +#define MOVUDQ movups
69 .Lgf128mul_x_ble_mask:
70 .octa 0x00000000000000010000000000000087
72 POLY: .octa 0xC2000000000000000000000000000001
73 TWOONE: .octa 0x00000001000000000000000000000001
75 @@ -90,6 +101,7 @@ enc: .octa 0x2
76 #define arg8 STACK_OFFSET+16(%r14)
77 #define arg9 STACK_OFFSET+24(%r14)
78 #define arg10 STACK_OFFSET+32(%r14)
79 +#define keysize 2*15*16(%arg1)
83 @@ -214,10 +226,12 @@ enc: .octa 0x2
85 .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
86 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
87 + MOVADQ SHUF_MASK(%rip), %xmm14
88 mov arg7, %r10 # %r10 = AAD
89 mov arg8, %r15 # %r15 = aadLen
93 _get_AAD_loop\num_initial_blocks\operation:
96 @@ -226,6 +240,7 @@ _get_AAD_loop\num_initial_blocks\operation:
99 jne _get_AAD_loop\num_initial_blocks\operation
102 je _get_AAD_loop2_done\num_initial_blocks\operation
104 @@ -234,8 +249,8 @@ _get_AAD_loop2\num_initial_blocks\operation:
107 jne _get_AAD_loop2\num_initial_blocks\operation
109 _get_AAD_loop2_done\num_initial_blocks\operation:
110 - movdqa SHUF_MASK(%rip), %xmm14
111 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
113 xor %r11, %r11 # initialise the data pointer offset as zero
114 @@ -244,59 +259,34 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
116 mov %arg5, %rax # %rax = *Y0
117 movdqu (%rax), \XMM0 # XMM0 = Y0
118 - movdqa SHUF_MASK(%rip), %xmm14
119 PSHUFB_XMM %xmm14, \XMM0
121 .if (\i == 5) || (\i == 6) || (\i == 7)
122 + MOVADQ ONE(%RIP),\TMP1
123 + MOVADQ (%arg1),\TMP2
125 - paddd ONE(%rip), \XMM0 # INCR Y0
126 + paddd \TMP1, \XMM0 # INCR Y0
127 movdqa \XMM0, %xmm\index
128 - movdqa SHUF_MASK(%rip), %xmm14
129 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
133 - pxor 16*0(%arg1), %xmm\index
136 - movaps 0x10(%rdi), \TMP1
137 - AESENC \TMP1, %xmm\index # Round 1
140 - movaps 0x20(%arg1), \TMP1
141 - AESENC \TMP1, %xmm\index # Round 2
142 + pxor \TMP2, %xmm\index
145 - movaps 0x30(%arg1), \TMP1
146 - AESENC \TMP1, %xmm\index # Round 2
149 - movaps 0x40(%arg1), \TMP1
150 - AESENC \TMP1, %xmm\index # Round 2
153 - movaps 0x50(%arg1), \TMP1
154 - AESENC \TMP1, %xmm\index # Round 2
157 - movaps 0x60(%arg1), \TMP1
158 - AESENC \TMP1, %xmm\index # Round 2
161 - movaps 0x70(%arg1), \TMP1
162 - AESENC \TMP1, %xmm\index # Round 2
165 - movaps 0x80(%arg1), \TMP1
166 - AESENC \TMP1, %xmm\index # Round 2
169 - movaps 0x90(%arg1), \TMP1
170 - AESENC \TMP1, %xmm\index # Round 2
171 + lea 0x10(%arg1),%r10
173 + shr $2,%eax # 128->4, 192->6, 256->8
174 + add $5,%eax # 128->9, 192->11, 256->13
176 +aes_loop_initial_dec\num_initial_blocks:
177 + MOVADQ (%r10),\TMP1
179 + AESENC \TMP1, %xmm\index
183 + jnz aes_loop_initial_dec\num_initial_blocks
185 + MOVADQ (%r10), \TMP1
187 - movaps 0xa0(%arg1), \TMP1
188 - AESENCLAST \TMP1, %xmm\index # Round 10
189 + AESENCLAST \TMP1, %xmm\index # Last Round
192 movdqu (%arg3 , %r11, 1), \TMP1
193 @@ -306,10 +296,8 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
196 movdqa \TMP1, %xmm\index
197 - movdqa SHUF_MASK(%rip), %xmm14
198 PSHUFB_XMM %xmm14, %xmm\index
200 - # prepare plaintext/ciphertext for GHASH computation
201 + # prepare plaintext/ciphertext for GHASH computation
204 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
205 @@ -339,30 +327,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
206 * Precomputations for HashKey parallel with encryption of first 4 blocks.
207 * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
209 - paddd ONE(%rip), \XMM0 # INCR Y0
210 - movdqa \XMM0, \XMM1
211 - movdqa SHUF_MASK(%rip), %xmm14
212 + MOVADQ ONE(%rip), \TMP1
213 + paddd \TMP1, \XMM0 # INCR Y0
214 + MOVADQ \XMM0, \XMM1
215 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
217 - paddd ONE(%rip), \XMM0 # INCR Y0
218 - movdqa \XMM0, \XMM2
219 - movdqa SHUF_MASK(%rip), %xmm14
220 + paddd \TMP1, \XMM0 # INCR Y0
221 + MOVADQ \XMM0, \XMM2
222 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
224 - paddd ONE(%rip), \XMM0 # INCR Y0
225 - movdqa \XMM0, \XMM3
226 - movdqa SHUF_MASK(%rip), %xmm14
227 + paddd \TMP1, \XMM0 # INCR Y0
228 + MOVADQ \XMM0, \XMM3
229 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
231 - paddd ONE(%rip), \XMM0 # INCR Y0
232 - movdqa \XMM0, \XMM4
233 - movdqa SHUF_MASK(%rip), %xmm14
234 + paddd \TMP1, \XMM0 # INCR Y0
235 + MOVADQ \XMM0, \XMM4
236 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
238 - pxor 16*0(%arg1), \XMM1
239 - pxor 16*0(%arg1), \XMM2
240 - pxor 16*0(%arg1), \XMM3
241 - pxor 16*0(%arg1), \XMM4
242 + MOVADQ 0(%arg1),\TMP1
248 pshufd $78, \TMP3, \TMP1
250 @@ -400,7 +386,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
251 pshufd $78, \TMP5, \TMP1
253 movdqa \TMP1, HashKey_4_k(%rsp)
254 - movaps 0xa0(%arg1), \TMP2
255 + lea 0xa0(%arg1),%r10
257 + shr $2,%eax # 128->4, 192->6, 256->8
258 + sub $4,%eax # 128->0, 192->2, 256->4
259 + jz aes_loop_pre_dec_done\num_initial_blocks
261 +aes_loop_pre_dec\num_initial_blocks:
262 + MOVADQ (%r10),\TMP2
264 + AESENC \TMP2, %xmm\index
268 + jnz aes_loop_pre_dec\num_initial_blocks
270 +aes_loop_pre_dec_done\num_initial_blocks:
271 + MOVADQ (%r10), \TMP2
272 AESENCLAST \TMP2, \XMM1
273 AESENCLAST \TMP2, \XMM2
274 AESENCLAST \TMP2, \XMM3
275 @@ -422,15 +424,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
276 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
279 - movdqa SHUF_MASK(%rip), %xmm14
280 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
282 # combine GHASHed value with the corresponding ciphertext
283 - movdqa SHUF_MASK(%rip), %xmm14
284 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
285 - movdqa SHUF_MASK(%rip), %xmm14
286 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
287 - movdqa SHUF_MASK(%rip), %xmm14
288 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
290 _initial_blocks_done\num_initial_blocks\operation:
291 @@ -452,6 +450,7 @@ _initial_blocks_done\num_initial_blocks\operation:
293 .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
294 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
295 + MOVADQ SHUF_MASK(%rip), %xmm14
296 mov arg7, %r10 # %r10 = AAD
297 mov arg8, %r15 # %r15 = aadLen
299 @@ -473,7 +472,6 @@ _get_AAD_loop2\num_initial_blocks\operation:
301 jne _get_AAD_loop2\num_initial_blocks\operation
302 _get_AAD_loop2_done\num_initial_blocks\operation:
303 - movdqa SHUF_MASK(%rip), %xmm14
304 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
306 xor %r11, %r11 # initialise the data pointer offset as zero
307 @@ -482,59 +480,35 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
309 mov %arg5, %rax # %rax = *Y0
310 movdqu (%rax), \XMM0 # XMM0 = Y0
311 - movdqa SHUF_MASK(%rip), %xmm14
312 PSHUFB_XMM %xmm14, \XMM0
314 .if (\i == 5) || (\i == 6) || (\i == 7)
316 - paddd ONE(%rip), \XMM0 # INCR Y0
317 - movdqa \XMM0, %xmm\index
318 - movdqa SHUF_MASK(%rip), %xmm14
319 - PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
323 - pxor 16*0(%arg1), %xmm\index
326 - movaps 0x10(%rdi), \TMP1
327 - AESENC \TMP1, %xmm\index # Round 1
330 - movaps 0x20(%arg1), \TMP1
331 - AESENC \TMP1, %xmm\index # Round 2
333 + MOVADQ ONE(%RIP),\TMP1
334 + MOVADQ 0(%arg1),\TMP2
336 - movaps 0x30(%arg1), \TMP1
337 - AESENC \TMP1, %xmm\index # Round 2
338 + paddd \TMP1, \XMM0 # INCR Y0
339 + MOVADQ \XMM0, %xmm\index
340 + PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
341 + pxor \TMP2, %xmm\index
344 - movaps 0x40(%arg1), \TMP1
345 - AESENC \TMP1, %xmm\index # Round 2
348 - movaps 0x50(%arg1), \TMP1
349 - AESENC \TMP1, %xmm\index # Round 2
352 - movaps 0x60(%arg1), \TMP1
353 - AESENC \TMP1, %xmm\index # Round 2
356 - movaps 0x70(%arg1), \TMP1
357 - AESENC \TMP1, %xmm\index # Round 2
360 - movaps 0x80(%arg1), \TMP1
361 - AESENC \TMP1, %xmm\index # Round 2
364 - movaps 0x90(%arg1), \TMP1
365 - AESENC \TMP1, %xmm\index # Round 2
366 + lea 0x10(%arg1),%r10
368 + shr $2,%eax # 128->4, 192->6, 256->8
369 + add $5,%eax # 128->9, 192->11, 256->13
371 +aes_loop_initial_enc\num_initial_blocks:
372 + MOVADQ (%r10),\TMP1
374 + AESENC \TMP1, %xmm\index
378 + jnz aes_loop_initial_enc\num_initial_blocks
380 + MOVADQ (%r10), \TMP1
382 - movaps 0xa0(%arg1), \TMP1
383 - AESENCLAST \TMP1, %xmm\index # Round 10
384 + AESENCLAST \TMP1, %xmm\index # Last Round
387 movdqu (%arg3 , %r11, 1), \TMP1
388 @@ -542,8 +516,6 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
389 movdqu %xmm\index, (%arg2 , %r11, 1)
390 # write back plaintext/ciphertext for num_initial_blocks
393 - movdqa SHUF_MASK(%rip), %xmm14
394 PSHUFB_XMM %xmm14, %xmm\index
396 # prepare plaintext/ciphertext for GHASH computation
397 @@ -576,30 +548,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
398 * Precomputations for HashKey parallel with encryption of first 4 blocks.
399 * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
401 - paddd ONE(%rip), \XMM0 # INCR Y0
402 - movdqa \XMM0, \XMM1
403 - movdqa SHUF_MASK(%rip), %xmm14
404 + MOVADQ ONE(%RIP),\TMP1
405 + paddd \TMP1, \XMM0 # INCR Y0
406 + MOVADQ \XMM0, \XMM1
407 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
409 - paddd ONE(%rip), \XMM0 # INCR Y0
410 - movdqa \XMM0, \XMM2
411 - movdqa SHUF_MASK(%rip), %xmm14
412 + paddd \TMP1, \XMM0 # INCR Y0
413 + MOVADQ \XMM0, \XMM2
414 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
416 - paddd ONE(%rip), \XMM0 # INCR Y0
417 - movdqa \XMM0, \XMM3
418 - movdqa SHUF_MASK(%rip), %xmm14
419 + paddd \TMP1, \XMM0 # INCR Y0
420 + MOVADQ \XMM0, \XMM3
421 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
423 - paddd ONE(%rip), \XMM0 # INCR Y0
424 - movdqa \XMM0, \XMM4
425 - movdqa SHUF_MASK(%rip), %xmm14
426 + paddd \TMP1, \XMM0 # INCR Y0
427 + MOVADQ \XMM0, \XMM4
428 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
430 - pxor 16*0(%arg1), \XMM1
431 - pxor 16*0(%arg1), \XMM2
432 - pxor 16*0(%arg1), \XMM3
433 - pxor 16*0(%arg1), \XMM4
434 + MOVADQ 0(%arg1),\TMP1
440 pshufd $78, \TMP3, \TMP1
442 @@ -637,7 +607,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
443 pshufd $78, \TMP5, \TMP1
445 movdqa \TMP1, HashKey_4_k(%rsp)
446 - movaps 0xa0(%arg1), \TMP2
447 + lea 0xa0(%arg1),%r10
449 + shr $2,%eax # 128->4, 192->6, 256->8
450 + sub $4,%eax # 128->0, 192->2, 256->4
451 + jz aes_loop_pre_enc_done\num_initial_blocks
453 +aes_loop_pre_enc\num_initial_blocks:
454 + MOVADQ (%r10),\TMP2
456 + AESENC \TMP2, %xmm\index
460 + jnz aes_loop_pre_enc\num_initial_blocks
462 +aes_loop_pre_enc_done\num_initial_blocks:
463 + MOVADQ (%r10), \TMP2
464 AESENCLAST \TMP2, \XMM1
465 AESENCLAST \TMP2, \XMM2
466 AESENCLAST \TMP2, \XMM3
467 @@ -656,15 +642,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
468 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
471 - movdqa SHUF_MASK(%rip), %xmm14
472 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
474 # combine GHASHed value with the corresponding ciphertext
475 - movdqa SHUF_MASK(%rip), %xmm14
476 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
477 - movdqa SHUF_MASK(%rip), %xmm14
478 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
479 - movdqa SHUF_MASK(%rip), %xmm14
480 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
482 _initial_blocks_done\num_initial_blocks\operation:
483 @@ -795,7 +777,23 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
486 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
487 - movaps 0xa0(%arg1), \TMP3
488 + lea 0xa0(%arg1),%r10
490 + shr $2,%eax # 128->4, 192->6, 256->8
491 + sub $4,%eax # 128->0, 192->2, 256->4
492 + jz aes_loop_par_enc_done
495 + MOVADQ (%r10),\TMP3
497 + AESENC \TMP3, %xmm\index
501 + jnz aes_loop_par_enc
503 +aes_loop_par_enc_done:
504 + MOVADQ (%r10), \TMP3
505 AESENCLAST \TMP3, \XMM1 # Round 10
506 AESENCLAST \TMP3, \XMM2
507 AESENCLAST \TMP3, \XMM3
508 @@ -987,8 +985,24 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
511 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
512 - movaps 0xa0(%arg1), \TMP3
513 - AESENCLAST \TMP3, \XMM1 # Round 10
514 + lea 0xa0(%arg1),%r10
516 + shr $2,%eax # 128->4, 192->6, 256->8
517 + sub $4,%eax # 128->0, 192->2, 256->4
518 + jz aes_loop_par_dec_done
521 + MOVADQ (%r10),\TMP3
523 + AESENC \TMP3, %xmm\index
527 + jnz aes_loop_par_dec
529 +aes_loop_par_dec_done:
530 + MOVADQ (%r10), \TMP3
531 + AESENCLAST \TMP3, \XMM1 # last round
532 AESENCLAST \TMP3, \XMM2
533 AESENCLAST \TMP3, \XMM3
534 AESENCLAST \TMP3, \XMM4
535 @@ -1156,33 +1170,29 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
536 pxor \TMP6, \XMMDst # reduced result is in XMMDst
539 -/* Encryption of a single block done*/
540 -.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
542 - pxor (%arg1), \XMM0
543 - movaps 16(%arg1), \TMP1
544 - AESENC \TMP1, \XMM0
545 - movaps 32(%arg1), \TMP1
546 - AESENC \TMP1, \XMM0
547 - movaps 48(%arg1), \TMP1
548 - AESENC \TMP1, \XMM0
549 - movaps 64(%arg1), \TMP1
550 - AESENC \TMP1, \XMM0
551 - movaps 80(%arg1), \TMP1
552 - AESENC \TMP1, \XMM0
553 - movaps 96(%arg1), \TMP1
554 - AESENC \TMP1, \XMM0
555 - movaps 112(%arg1), \TMP1
556 - AESENC \TMP1, \XMM0
557 - movaps 128(%arg1), \TMP1
558 - AESENC \TMP1, \XMM0
559 - movaps 144(%arg1), \TMP1
560 - AESENC \TMP1, \XMM0
561 - movaps 160(%arg1), \TMP1
562 - AESENCLAST \TMP1, \XMM0
564 +/* Encryption of a single block
568 +.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
570 + pxor (%arg1), \XMM0
572 + shr $2,%eax # 128->4, 192->6, 256->8
573 + add $5,%eax # 128->9, 192->11, 256->13
574 + lea 16(%arg1), %r10 # get first expanded key address
577 + MOVADQ (%r10),\TMP1
583 + MOVADQ (%r10),\TMP1
584 + AESENCLAST \TMP1,\XMM0
586 /*****************************************************************************
587 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
588 * u8 *out, // Plaintext output. Encrypt in-place is allowed.
589 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
590 index 6d4faba..bfaf817 100644
591 --- a/arch/x86/crypto/aesni-intel_glue.c
592 +++ b/arch/x86/crypto/aesni-intel_glue.c
593 @@ -177,7 +177,8 @@ static void aesni_gcm_enc_avx(void *ctx, u8 *out,
594 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
595 u8 *auth_tag, unsigned long auth_tag_len)
597 - if (plaintext_len < AVX_GEN2_OPTSIZE) {
598 + struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
599 + if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){
600 aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
601 aad_len, auth_tag, auth_tag_len);
603 @@ -192,7 +193,8 @@ static void aesni_gcm_dec_avx(void *ctx, u8 *out,
604 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
605 u8 *auth_tag, unsigned long auth_tag_len)
607 - if (ciphertext_len < AVX_GEN2_OPTSIZE) {
608 + struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
609 + if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
610 aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad,
611 aad_len, auth_tag, auth_tag_len);
613 @@ -226,7 +228,8 @@ static void aesni_gcm_enc_avx2(void *ctx, u8 *out,
614 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
615 u8 *auth_tag, unsigned long auth_tag_len)
617 - if (plaintext_len < AVX_GEN2_OPTSIZE) {
618 + struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
619 + if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
620 aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
621 aad_len, auth_tag, auth_tag_len);
622 } else if (plaintext_len < AVX_GEN4_OPTSIZE) {
623 @@ -245,7 +248,8 @@ static void aesni_gcm_dec_avx2(void *ctx, u8 *out,
624 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
625 u8 *auth_tag, unsigned long auth_tag_len)
627 - if (ciphertext_len < AVX_GEN2_OPTSIZE) {
628 + struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
629 + if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
630 aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey,
631 aad, aad_len, auth_tag, auth_tag_len);
632 } else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
633 @@ -878,7 +882,8 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
635 /*Account for 4 byte nonce at the end.*/
637 - if (key_len != AES_KEYSIZE_128) {
638 + if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
639 + key_len != AES_KEYSIZE_256) {
640 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
643 @@ -989,6 +994,7 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
644 __be32 counter = cpu_to_be32(1);
645 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
646 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
647 + u32 key_len = ctx->aes_key_expanded.key_length;
648 void *aes_ctx = &(ctx->aes_key_expanded);
649 unsigned long auth_tag_len = crypto_aead_authsize(tfm);
650 u8 iv_tab[16+AESNI_ALIGN];
651 @@ -1003,6 +1009,13 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
652 /* to 8 or 12 bytes */
653 if (unlikely(req->assoclen != 8 && req->assoclen != 12))
655 + if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16))
657 + if (unlikely(key_len != AES_KEYSIZE_128 &&
658 + key_len != AES_KEYSIZE_192 &&
659 + key_len != AES_KEYSIZE_256))
663 for (i = 0; i < 4; i++)
664 *(iv+i) = ctx->nonce[i];
665 @@ -1067,6 +1080,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
667 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
668 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
669 + u32 key_len = ctx->aes_key_expanded.key_length;
670 void *aes_ctx = &(ctx->aes_key_expanded);
671 unsigned long auth_tag_len = crypto_aead_authsize(tfm);
672 u8 iv_and_authTag[32+AESNI_ALIGN];
673 @@ -1080,6 +1094,13 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
674 if (unlikely((req->cryptlen < auth_tag_len) ||
675 (req->assoclen != 8 && req->assoclen != 12)))
677 + if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16))
679 + if (unlikely(key_len != AES_KEYSIZE_128 &&
680 + key_len != AES_KEYSIZE_192 &&
681 + key_len != AES_KEYSIZE_256))
684 /* Assuming we are supporting rfc4106 64-bit extended */
685 /* sequence numbers We need to have the AAD length */
686 /* equal to 8 or 12 bytes */