--- /dev/null
+From c11104b3bf91d5502fe7f0da2b48a3caebad5dc2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 27 Nov 2020 10:44:52 +0100
+Subject: crypto: aesni - Use TEST %reg,%reg instead of CMP $0,%reg
+
+From: Uros Bizjak <ubizjak@gmail.com>
+
+[ Upstream commit 032d049ea0f45b45c21f3f02b542aa18bc6b6428 ]
+
+CMP $0,%reg can't set the overflow flag, so we can use the shorter TEST
+%reg,%reg instruction when only the zero and sign flags are checked
+(E,L,LE,G,GE conditions).
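+
+For illustration, a minimal sketch of the two forms on %r13, with the
+encodings gas typically emits (byte counts are the point, not this patch):
+
+	cmp  $0, %r13		# 49 83 fd 00 (4 bytes)
+	test %r13, %r13		# 4d 85 ed    (3 bytes)
+
+Both clear CF/OF and set ZF/SF from the value of %r13, so je/jl/jle/jg/jge
+behave identically after either form, and TEST saves a byte per site.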
+
+Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
+Cc: Herbert Xu <herbert@gondor.apana.org.au>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: "H. Peter Anvin" <hpa@zytor.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/crypto/aesni-intel_asm.S | 20 ++++++++++----------
+ arch/x86/crypto/aesni-intel_avx-x86_64.S | 20 ++++++++++----------
+ 2 files changed, 20 insertions(+), 20 deletions(-)
+
+diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
+index 1852b19a73a0..d1436c37008b 100644
+--- a/arch/x86/crypto/aesni-intel_asm.S
++++ b/arch/x86/crypto/aesni-intel_asm.S
+@@ -318,7 +318,7 @@ _initial_blocks_\@:
+
+ # Main loop - Encrypt/Decrypt remaining blocks
+
+- cmp $0, %r13
++ test %r13, %r13
+ je _zero_cipher_left_\@
+ sub $64, %r13
+ je _four_cipher_left_\@
+@@ -437,7 +437,7 @@ _multiple_of_16_bytes_\@:
+
+ mov PBlockLen(%arg2), %r12
+
+- cmp $0, %r12
++ test %r12, %r12
+ je _partial_done\@
+
+ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+@@ -474,7 +474,7 @@ _T_8_\@:
+ add $8, %r10
+ sub $8, %r11
+ psrldq $8, %xmm0
+- cmp $0, %r11
++ test %r11, %r11
+ je _return_T_done_\@
+ _T_4_\@:
+ movd %xmm0, %eax
+@@ -482,7 +482,7 @@ _T_4_\@:
+ add $4, %r10
+ sub $4, %r11
+ psrldq $4, %xmm0
+- cmp $0, %r11
++ test %r11, %r11
+ je _return_T_done_\@
+ _T_123_\@:
+ movd %xmm0, %eax
+@@ -619,7 +619,7 @@ _get_AAD_blocks\@:
+
+ /* read the last <16B of AAD */
+ _get_AAD_rest\@:
+- cmp $0, %r11
++ test %r11, %r11
+ je _get_AAD_done\@
+
+ READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
+@@ -640,7 +640,7 @@ _get_AAD_done\@:
+ .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
+ AAD_HASH operation
+ mov PBlockLen(%arg2), %r13
+- cmp $0, %r13
++ test %r13, %r13
+ je _partial_block_done_\@ # Leave Macro if no partial blocks
+ # Read in input data without over reading
+ cmp $16, \PLAIN_CYPH_LEN
+@@ -692,7 +692,7 @@ _no_extra_mask_1_\@:
+ pshufb %xmm2, %xmm3
+ pxor %xmm3, \AAD_HASH
+
+- cmp $0, %r10
++ test %r10, %r10
+ jl _partial_incomplete_1_\@
+
+ # GHASH computation for the last <16 Byte block
+@@ -727,7 +727,7 @@ _no_extra_mask_2_\@:
+ pshufb %xmm2, %xmm9
+ pxor %xmm9, \AAD_HASH
+
+- cmp $0, %r10
++ test %r10, %r10
+ jl _partial_incomplete_2_\@
+
+ # GHASH computation for the last <16 Byte block
+@@ -747,7 +747,7 @@ _encode_done_\@:
+ pshufb %xmm2, %xmm9
+ .endif
+ # output encrypted Bytes
+- cmp $0, %r10
++ test %r10, %r10
+ jl _partial_fill_\@
+ mov %r13, %r12
+ mov $16, %r13
+@@ -2720,7 +2720,7 @@ SYM_FUNC_END(aesni_ctr_enc)
+ */
+ SYM_FUNC_START(aesni_xts_crypt8)
+ FRAME_BEGIN
+- cmpb $0, %cl
++ testb %cl, %cl
+ movl $0, %ecx
+ movl $240, %r10d
+ leaq _aesni_enc4, %r11
+diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
+index 5fee47956f3b..2cf8e94d986a 100644
+--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
++++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
+@@ -369,7 +369,7 @@ _initial_num_blocks_is_0\@:
+
+
+ _initial_blocks_encrypted\@:
+- cmp $0, %r13
++ test %r13, %r13
+ je _zero_cipher_left\@
+
+ sub $128, %r13
+@@ -528,7 +528,7 @@ _multiple_of_16_bytes\@:
+ vmovdqu HashKey(arg2), %xmm13
+
+ mov PBlockLen(arg2), %r12
+- cmp $0, %r12
++ test %r12, %r12
+ je _partial_done\@
+
+ #GHASH computation for the last <16 Byte block
+@@ -573,7 +573,7 @@ _T_8\@:
+ add $8, %r10
+ sub $8, %r11
+ vpsrldq $8, %xmm9, %xmm9
+- cmp $0, %r11
++ test %r11, %r11
+ je _return_T_done\@
+ _T_4\@:
+ vmovd %xmm9, %eax
+@@ -581,7 +581,7 @@ _T_4\@:
+ add $4, %r10
+ sub $4, %r11
+ vpsrldq $4, %xmm9, %xmm9
+- cmp $0, %r11
++ test %r11, %r11
+ je _return_T_done\@
+ _T_123\@:
+ vmovd %xmm9, %eax
+@@ -625,7 +625,7 @@ _get_AAD_blocks\@:
+ cmp $16, %r11
+ jge _get_AAD_blocks\@
+ vmovdqu \T8, \T7
+- cmp $0, %r11
++ test %r11, %r11
+ je _get_AAD_done\@
+
+ vpxor \T7, \T7, \T7
+@@ -644,7 +644,7 @@ _get_AAD_rest8\@:
+ vpxor \T1, \T7, \T7
+ jmp _get_AAD_rest8\@
+ _get_AAD_rest4\@:
+- cmp $0, %r11
++ test %r11, %r11
+ jle _get_AAD_rest0\@
+ mov (%r10), %eax
+ movq %rax, \T1
+@@ -749,7 +749,7 @@ _done_read_partial_block_\@:
+ .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
+ AAD_HASH ENC_DEC
+ mov PBlockLen(arg2), %r13
+- cmp $0, %r13
++ test %r13, %r13
+ je _partial_block_done_\@ # Leave Macro if no partial blocks
+ # Read in input data without over reading
+ cmp $16, \PLAIN_CYPH_LEN
+@@ -801,7 +801,7 @@ _no_extra_mask_1_\@:
+ vpshufb %xmm2, %xmm3, %xmm3
+ vpxor %xmm3, \AAD_HASH, \AAD_HASH
+
+- cmp $0, %r10
++ test %r10, %r10
+ jl _partial_incomplete_1_\@
+
+ # GHASH computation for the last <16 Byte block
+@@ -836,7 +836,7 @@ _no_extra_mask_2_\@:
+ vpshufb %xmm2, %xmm9, %xmm9
+ vpxor %xmm9, \AAD_HASH, \AAD_HASH
+
+- cmp $0, %r10
++ test %r10, %r10
+ jl _partial_incomplete_2_\@
+
+ # GHASH computation for the last <16 Byte block
+@@ -856,7 +856,7 @@ _encode_done_\@:
+ vpshufb %xmm2, %xmm9, %xmm9
+ .endif
+ # output encrypted Bytes
+- cmp $0, %r10
++ test %r10, %r10
+ jl _partial_fill_\@
+ mov %r13, %r12
+ mov $16, %r13
+--
+2.30.1
+
--- /dev/null
+From fcbbb7a6484bd5ae998197af4bb02dec2f495414 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 31 Dec 2020 17:41:54 +0100
+Subject: crypto: x86/aes-ni-xts - use direct calls to and 4-way stride
+
+From: Ard Biesheuvel <ardb@kernel.org>
+
+[ Upstream commit 86ad60a65f29dd862a11c22bb4b5be28d6c5cef1 ]
+
+The XTS asm helper arrangement is a bit odd: the 8-way stride helper
+consists of back-to-back calls to the 4-way core transforms, which
+are called indirectly, based on a boolean that indicates whether we
+are performing encryption or decryption.
+
+Given how costly indirect calls are on x86, let's switch to direct
+calls, and given how the 8-way stride doesn't really add anything
+substantial, use a 4-way stride instead, and make the asm core
+routine deal with any multiple of 4 blocks. Since 512 byte sectors
+or 4 KB blocks are the typical quantities XTS operates on, increase
+the stride exported to the glue helper to 512 bytes as well.
+
+As a result, the number of indirect calls is reduced from 3 per 64 bytes
+of in/output to 1 per 512 bytes of in/output, which produces a 65% speedup
+when operating on 1 KB blocks (measured on an Intel(R) Core(TM) i7-8650U CPU).
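+
+As a rough sketch of the resulting arithmetic (following the glue change
+below, which passes 32 * AES_BLOCK_SIZE to the new asm routines):
+
+	32 blocks * 16 bytes/block = 512 bytes per aesni_xts_encrypt() call,
+	i.e. one direct call per 512-byte sector, which the asm loop covers
+	in 512 / 64 = 8 iterations of its 4-way (4 * 16 = 64 byte) stride.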
+
+Fixes: 9697fa39efd3f ("x86/retpoline/crypto: Convert crypto assembler indirect jumps")
+Tested-by: Eric Biggers <ebiggers@google.com> # x86_64
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/crypto/aesni-intel_asm.S | 115 ++++++++++++++++++-----------
+ arch/x86/crypto/aesni-intel_glue.c | 25 ++++---
+ 2 files changed, 84 insertions(+), 56 deletions(-)
+
+diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
+index d1436c37008b..57aef3f5a81e 100644
+--- a/arch/x86/crypto/aesni-intel_asm.S
++++ b/arch/x86/crypto/aesni-intel_asm.S
+@@ -2715,25 +2715,18 @@ SYM_FUNC_END(aesni_ctr_enc)
+ pxor CTR, IV;
+
+ /*
+- * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
+- * const u8 *src, bool enc, le128 *iv)
++ * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
++ * const u8 *src, unsigned int len, le128 *iv)
+ */
+-SYM_FUNC_START(aesni_xts_crypt8)
++SYM_FUNC_START(aesni_xts_encrypt)
+ FRAME_BEGIN
+- testb %cl, %cl
+- movl $0, %ecx
+- movl $240, %r10d
+- leaq _aesni_enc4, %r11
+- leaq _aesni_dec4, %rax
+- cmovel %r10d, %ecx
+- cmoveq %rax, %r11
+
+ movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+ movups (IVP), IV
+
+ mov 480(KEYP), KLEN
+- addq %rcx, KEYP
+
++.Lxts_enc_loop4:
+ movdqa IV, STATE1
+ movdqu 0x00(INP), INC
+ pxor INC, STATE1
+@@ -2757,71 +2750,103 @@ SYM_FUNC_START(aesni_xts_crypt8)
+ pxor INC, STATE4
+ movdqu IV, 0x30(OUTP)
+
+- CALL_NOSPEC r11
++ call _aesni_enc4
+
+ movdqu 0x00(OUTP), INC
+ pxor INC, STATE1
+ movdqu STATE1, 0x00(OUTP)
+
+- _aesni_gf128mul_x_ble()
+- movdqa IV, STATE1
+- movdqu 0x40(INP), INC
+- pxor INC, STATE1
+- movdqu IV, 0x40(OUTP)
+-
+ movdqu 0x10(OUTP), INC
+ pxor INC, STATE2
+ movdqu STATE2, 0x10(OUTP)
+
+- _aesni_gf128mul_x_ble()
+- movdqa IV, STATE2
+- movdqu 0x50(INP), INC
+- pxor INC, STATE2
+- movdqu IV, 0x50(OUTP)
+-
+ movdqu 0x20(OUTP), INC
+ pxor INC, STATE3
+ movdqu STATE3, 0x20(OUTP)
+
+- _aesni_gf128mul_x_ble()
+- movdqa IV, STATE3
+- movdqu 0x60(INP), INC
+- pxor INC, STATE3
+- movdqu IV, 0x60(OUTP)
+-
+ movdqu 0x30(OUTP), INC
+ pxor INC, STATE4
+ movdqu STATE4, 0x30(OUTP)
+
+ _aesni_gf128mul_x_ble()
+- movdqa IV, STATE4
+- movdqu 0x70(INP), INC
+- pxor INC, STATE4
+- movdqu IV, 0x70(OUTP)
+
+- _aesni_gf128mul_x_ble()
++ add $64, INP
++ add $64, OUTP
++ sub $64, LEN
++ ja .Lxts_enc_loop4
++
+ movups IV, (IVP)
+
+- CALL_NOSPEC r11
++ FRAME_END
++ ret
++SYM_FUNC_END(aesni_xts_encrypt)
++
++/*
++ * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
++ * const u8 *src, unsigned int len, le128 *iv)
++ */
++SYM_FUNC_START(aesni_xts_decrypt)
++ FRAME_BEGIN
++
++ movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
++ movups (IVP), IV
++
++ mov 480(KEYP), KLEN
++ add $240, KEYP
+
+- movdqu 0x40(OUTP), INC
++.Lxts_dec_loop4:
++ movdqa IV, STATE1
++ movdqu 0x00(INP), INC
+ pxor INC, STATE1
+- movdqu STATE1, 0x40(OUTP)
++ movdqu IV, 0x00(OUTP)
+
+- movdqu 0x50(OUTP), INC
++ _aesni_gf128mul_x_ble()
++ movdqa IV, STATE2
++ movdqu 0x10(INP), INC
++ pxor INC, STATE2
++ movdqu IV, 0x10(OUTP)
++
++ _aesni_gf128mul_x_ble()
++ movdqa IV, STATE3
++ movdqu 0x20(INP), INC
++ pxor INC, STATE3
++ movdqu IV, 0x20(OUTP)
++
++ _aesni_gf128mul_x_ble()
++ movdqa IV, STATE4
++ movdqu 0x30(INP), INC
++ pxor INC, STATE4
++ movdqu IV, 0x30(OUTP)
++
++ call _aesni_dec4
++
++ movdqu 0x00(OUTP), INC
++ pxor INC, STATE1
++ movdqu STATE1, 0x00(OUTP)
++
++ movdqu 0x10(OUTP), INC
+ pxor INC, STATE2
+- movdqu STATE2, 0x50(OUTP)
++ movdqu STATE2, 0x10(OUTP)
+
+- movdqu 0x60(OUTP), INC
++ movdqu 0x20(OUTP), INC
+ pxor INC, STATE3
+- movdqu STATE3, 0x60(OUTP)
++ movdqu STATE3, 0x20(OUTP)
+
+- movdqu 0x70(OUTP), INC
++ movdqu 0x30(OUTP), INC
+ pxor INC, STATE4
+- movdqu STATE4, 0x70(OUTP)
++ movdqu STATE4, 0x30(OUTP)
++
++ _aesni_gf128mul_x_ble()
++
++ add $64, INP
++ add $64, OUTP
++ sub $64, LEN
++ ja .Lxts_dec_loop4
++
++ movups IV, (IVP)
+
+ FRAME_END
+ ret
+-SYM_FUNC_END(aesni_xts_crypt8)
++SYM_FUNC_END(aesni_xts_decrypt)
+
+ #endif
+diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
+index f9a1d98e7534..be891fdf8d17 100644
+--- a/arch/x86/crypto/aesni-intel_glue.c
++++ b/arch/x86/crypto/aesni-intel_glue.c
+@@ -97,6 +97,12 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
+ #define AVX_GEN2_OPTSIZE 640
+ #define AVX_GEN4_OPTSIZE 4096
+
++asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out,
++ const u8 *in, unsigned int len, u8 *iv);
++
++asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out,
++ const u8 *in, unsigned int len, u8 *iv);
++
+ #ifdef CONFIG_X86_64
+
+ static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
+@@ -104,9 +110,6 @@ static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
+ asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+
+-asmlinkage void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *out,
+- const u8 *in, bool enc, le128 *iv);
+-
+ /* asmlinkage void aesni_gcm_enc()
+ * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
+ * struct gcm_context_data. May be uninitialized.
+@@ -547,14 +550,14 @@ static void aesni_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec);
+ }
+
+-static void aesni_xts_enc8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
++static void aesni_xts_enc32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+ {
+- aesni_xts_crypt8(ctx, dst, src, true, iv);
++ aesni_xts_encrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
+ }
+
+-static void aesni_xts_dec8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
++static void aesni_xts_dec32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+ {
+- aesni_xts_crypt8(ctx, dst, src, false, iv);
++ aesni_xts_decrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
+ }
+
+ static const struct common_glue_ctx aesni_enc_xts = {
+@@ -562,8 +565,8 @@ static const struct common_glue_ctx aesni_enc_xts = {
+ .fpu_blocks_limit = 1,
+
+ .funcs = { {
+- .num_blocks = 8,
+- .fn_u = { .xts = aesni_xts_enc8 }
++ .num_blocks = 32,
++ .fn_u = { .xts = aesni_xts_enc32 }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = aesni_xts_enc }
+@@ -575,8 +578,8 @@ static const struct common_glue_ctx aesni_dec_xts = {
+ .fpu_blocks_limit = 1,
+
+ .funcs = { {
+- .num_blocks = 8,
+- .fn_u = { .xts = aesni_xts_dec8 }
++ .num_blocks = 32,
++ .fn_u = { .xts = aesni_xts_dec32 }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = aesni_xts_dec }
+--
+2.30.1
+