5.11-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Fri, 19 Mar 2021 10:50:11 +0000 (11:50 +0100)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Fri, 19 Mar 2021 10:50:11 +0000 (11:50 +0100)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 19 Mar 2021 10:50:11 +0000 (11:50 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 19 Mar 2021 10:50:11 +0000 (11:50 +0100)
diff --git a/queue-5.11/crypto-x86-aes-ni-xts-use-direct-calls-to-and-4-way-stride.patch b/queue-5.11/crypto-x86-aes-ni-xts-use-direct-calls-to-and-4-way-stride.patch

new file mode 100644 (file)

index 0000000..fbbee48
--- /dev/null
+++ b/queue-5.11/crypto-x86-aes-ni-xts-use-direct-calls-to-and-4-way-stride.patch
@@ -0,0 +1,271 @@
+From 86ad60a65f29dd862a11c22bb4b5be28d6c5cef1 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Thu, 31 Dec 2020 17:41:54 +0100
+Subject: crypto: x86/aes-ni-xts - use direct calls to and 4-way stride
+
+From: Ard Biesheuvel <ardb@kernel.org>
+
+commit 86ad60a65f29dd862a11c22bb4b5be28d6c5cef1 upstream.
+
+The XTS asm helper arrangement is a bit odd: the 8-way stride helper
+consists of back-to-back calls to the 4-way core transforms, which
+are called indirectly, based on a boolean that indicates whether we
+are performing encryption or decryption.
+
+Given how costly indirect calls are on x86, let's switch to direct
+calls, and given how the 8-way stride doesn't really add anything
+substantial, use a 4-way stride instead, and make the asm core
+routine deal with any multiple of 4 blocks. Since 512 byte sectors
+or 4 KB blocks are the typical quantities XTS operates on, increase
+the stride exported to the glue helper to 512 bytes as well.
+
+As a result, the number of indirect calls is reduced from 3 per 64 bytes
+of in/output to 1 per 512 bytes of in/output, which produces a 65% speedup
+when operating on 1 KB blocks (measured on an Intel(R) Core(TM) i7-8650U CPU).
+
+Fixes: 9697fa39efd3f ("x86/retpoline/crypto: Convert crypto assembler indirect jumps")
+Tested-by: Eric Biggers <ebiggers@google.com> # x86_64
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/crypto/aesni-intel_asm.S  |  115 ++++++++++++++++++++++---------------
+ arch/x86/crypto/aesni-intel_glue.c |   25 ++++----
+ 2 files changed, 84 insertions(+), 56 deletions(-)
+
+--- a/arch/x86/crypto/aesni-intel_asm.S
++++ b/arch/x86/crypto/aesni-intel_asm.S
+@@ -2715,25 +2715,18 @@ SYM_FUNC_END(aesni_ctr_enc)
+       pxor CTR, IV;
+ 
+ /*
+- * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
+- *                     const u8 *src, bool enc, le128 *iv)
++ * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
++ *                      const u8 *src, unsigned int len, le128 *iv)
+  */
+-SYM_FUNC_START(aesni_xts_crypt8)
++SYM_FUNC_START(aesni_xts_encrypt)
+       FRAME_BEGIN
+-      testb %cl, %cl
+-      movl $0, %ecx
+-      movl $240, %r10d
+-      leaq _aesni_enc4, %r11
+-      leaq _aesni_dec4, %rax
+-      cmovel %r10d, %ecx
+-      cmoveq %rax, %r11
+ 
+       movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+       movups (IVP), IV
+ 
+       mov 480(KEYP), KLEN
+-      addq %rcx, KEYP
+ 
++.Lxts_enc_loop4:
+       movdqa IV, STATE1
+       movdqu 0x00(INP), INC
+       pxor INC, STATE1
+@@ -2757,71 +2750,103 @@ SYM_FUNC_START(aesni_xts_crypt8)
+       pxor INC, STATE4
+       movdqu IV, 0x30(OUTP)
+ 
+-      CALL_NOSPEC r11
++      call _aesni_enc4
+ 
+       movdqu 0x00(OUTP), INC
+       pxor INC, STATE1
+       movdqu STATE1, 0x00(OUTP)
+ 
+-      _aesni_gf128mul_x_ble()
+-      movdqa IV, STATE1
+-      movdqu 0x40(INP), INC
+-      pxor INC, STATE1
+-      movdqu IV, 0x40(OUTP)
+-
+       movdqu 0x10(OUTP), INC
+       pxor INC, STATE2
+       movdqu STATE2, 0x10(OUTP)
+ 
+-      _aesni_gf128mul_x_ble()
+-      movdqa IV, STATE2
+-      movdqu 0x50(INP), INC
+-      pxor INC, STATE2
+-      movdqu IV, 0x50(OUTP)
+-
+       movdqu 0x20(OUTP), INC
+       pxor INC, STATE3
+       movdqu STATE3, 0x20(OUTP)
+ 
+-      _aesni_gf128mul_x_ble()
+-      movdqa IV, STATE3
+-      movdqu 0x60(INP), INC
+-      pxor INC, STATE3
+-      movdqu IV, 0x60(OUTP)
+-
+       movdqu 0x30(OUTP), INC
+       pxor INC, STATE4
+       movdqu STATE4, 0x30(OUTP)
+ 
+       _aesni_gf128mul_x_ble()
+-      movdqa IV, STATE4
+-      movdqu 0x70(INP), INC
+-      pxor INC, STATE4
+-      movdqu IV, 0x70(OUTP)
+ 
+-      _aesni_gf128mul_x_ble()
++      add $64, INP
++      add $64, OUTP
++      sub $64, LEN
++      ja .Lxts_enc_loop4
++
+       movups IV, (IVP)
+ 
+-      CALL_NOSPEC r11
++      FRAME_END
++      ret
++SYM_FUNC_END(aesni_xts_encrypt)
++
++/*
++ * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
++ *                      const u8 *src, unsigned int len, le128 *iv)
++ */
++SYM_FUNC_START(aesni_xts_decrypt)
++      FRAME_BEGIN
++
++      movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
++      movups (IVP), IV
++
++      mov 480(KEYP), KLEN
++      add $240, KEYP
++
++.Lxts_dec_loop4:
++      movdqa IV, STATE1
++      movdqu 0x00(INP), INC
++      pxor INC, STATE1
++      movdqu IV, 0x00(OUTP)
++
++      _aesni_gf128mul_x_ble()
++      movdqa IV, STATE2
++      movdqu 0x10(INP), INC
++      pxor INC, STATE2
++      movdqu IV, 0x10(OUTP)
++
++      _aesni_gf128mul_x_ble()
++      movdqa IV, STATE3
++      movdqu 0x20(INP), INC
++      pxor INC, STATE3
++      movdqu IV, 0x20(OUTP)
++
++      _aesni_gf128mul_x_ble()
++      movdqa IV, STATE4
++      movdqu 0x30(INP), INC
++      pxor INC, STATE4
++      movdqu IV, 0x30(OUTP)
++
++      call _aesni_dec4
+ 
+-      movdqu 0x40(OUTP), INC
++      movdqu 0x00(OUTP), INC
+       pxor INC, STATE1
+-      movdqu STATE1, 0x40(OUTP)
++      movdqu STATE1, 0x00(OUTP)
+ 
+-      movdqu 0x50(OUTP), INC
++      movdqu 0x10(OUTP), INC
+       pxor INC, STATE2
+-      movdqu STATE2, 0x50(OUTP)
++      movdqu STATE2, 0x10(OUTP)
+ 
+-      movdqu 0x60(OUTP), INC
++      movdqu 0x20(OUTP), INC
+       pxor INC, STATE3
+-      movdqu STATE3, 0x60(OUTP)
++      movdqu STATE3, 0x20(OUTP)
+ 
+-      movdqu 0x70(OUTP), INC
++      movdqu 0x30(OUTP), INC
+       pxor INC, STATE4
+-      movdqu STATE4, 0x70(OUTP)
++      movdqu STATE4, 0x30(OUTP)
++
++      _aesni_gf128mul_x_ble()
++
++      add $64, INP
++      add $64, OUTP
++      sub $64, LEN
++      ja .Lxts_dec_loop4
++
++      movups IV, (IVP)
+ 
+       FRAME_END
+       ret
+-SYM_FUNC_END(aesni_xts_crypt8)
++SYM_FUNC_END(aesni_xts_decrypt)
+ 
+ #endif
+--- a/arch/x86/crypto/aesni-intel_glue.c
++++ b/arch/x86/crypto/aesni-intel_glue.c
+@@ -97,6 +97,12 @@ asmlinkage void aesni_cbc_dec(struct cry
+ #define AVX_GEN2_OPTSIZE 640
+ #define AVX_GEN4_OPTSIZE 4096
+ 
++asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out,
++                                const u8 *in, unsigned int len, u8 *iv);
++
++asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out,
++                                const u8 *in, unsigned int len, u8 *iv);
++
+ #ifdef CONFIG_X86_64
+ 
+ static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
+@@ -104,9 +110,6 @@ static void (*aesni_ctr_enc_tfm)(struct
+ asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
+                             const u8 *in, unsigned int len, u8 *iv);
+ 
+-asmlinkage void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *out,
+-                               const u8 *in, bool enc, le128 *iv);
+-
+ /* asmlinkage void aesni_gcm_enc()
+  * void *ctx,  AES Key schedule. Starts on a 16 byte boundary.
+  * struct gcm_context_data.  May be uninitialized.
+@@ -547,14 +550,14 @@ static void aesni_xts_dec(const void *ct
+       glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec);
+ }
+ 
+-static void aesni_xts_enc8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
++static void aesni_xts_enc32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+ {
+-      aesni_xts_crypt8(ctx, dst, src, true, iv);
++      aesni_xts_encrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
+ }
+ 
+-static void aesni_xts_dec8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
++static void aesni_xts_dec32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+ {
+-      aesni_xts_crypt8(ctx, dst, src, false, iv);
++      aesni_xts_decrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
+ }
+ 
+ static const struct common_glue_ctx aesni_enc_xts = {
+@@ -562,8 +565,8 @@ static const struct common_glue_ctx aesn
+       .fpu_blocks_limit = 1,
+ 
+       .funcs = { {
+-              .num_blocks = 8,
+-              .fn_u = { .xts = aesni_xts_enc8 }
++              .num_blocks = 32,
++              .fn_u = { .xts = aesni_xts_enc32 }
+       }, {
+               .num_blocks = 1,
+               .fn_u = { .xts = aesni_xts_enc }
+@@ -575,8 +578,8 @@ static const struct common_glue_ctx aesn
+       .fpu_blocks_limit = 1,
+ 
+       .funcs = { {
+-              .num_blocks = 8,
+-              .fn_u = { .xts = aesni_xts_dec8 }
++              .num_blocks = 32,
++              .fn_u = { .xts = aesni_xts_dec32 }
+       }, {
+               .num_blocks = 1,
+               .fn_u = { .xts = aesni_xts_dec }
diff --git a/queue-5.11/series b/queue-5.11/series

index 1268134b1fdf168cbf50a681861bfc9e770d370f..4bfcd3cd701b1e6d18140e5f1a67eec66bf8c5f6 100644 (file)
--- a/queue-5.11/series
+++ b/queue-5.11/series
@@ -28,3 +28,4 @@ fuse-fix-live-lock-in-fuse_iget.patch
  revert-nfsd4-remove-check_conflicting_opens-warning.patch
  revert-nfsd4-a-client-s-own-opens-needn-t-prevent-delegations.patch
  net-dsa-b53-support-setting-learning-on-port.patch
+crypto-x86-aes-ni-xts-use-direct-calls-to-and-4-way-stride.patch
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Fri, 19 Mar 2021 10:50:11 +0000 (11:50 +0100)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Fri, 19 Mar 2021 10:50:11 +0000 (11:50 +0100)
queue-5.11/crypto-x86-aes-ni-xts-use-direct-calls-to-and-4-way-stride.patch	[new file with mode: 0644]	patch | blob
queue-5.11/series		patch | blob | blame | history