From: Sasha Levin Date: Thu, 18 Mar 2021 13:39:13 +0000 (-0400) Subject: Fixes for 5.10 X-Git-Tag: v4.19.182~28 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=c521d9353e4821771b0b32073db9d2b370cb277a;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 5.10 Signed-off-by: Sasha Levin --- diff --git a/queue-5.10/crypto-aesni-use-test-reg-reg-instead-of-cmp-0-reg.patch b/queue-5.10/crypto-aesni-use-test-reg-reg-instead-of-cmp-0-reg.patch new file mode 100644 index 00000000000..7ba7361b0fb --- /dev/null +++ b/queue-5.10/crypto-aesni-use-test-reg-reg-instead-of-cmp-0-reg.patch @@ -0,0 +1,214 @@ +From c11104b3bf91d5502fe7f0da2b48a3caebad5dc2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 27 Nov 2020 10:44:52 +0100 +Subject: crypto: aesni - Use TEST %reg,%reg instead of CMP $0,%reg + +From: Uros Bizjak + +[ Upstream commit 032d049ea0f45b45c21f3f02b542aa18bc6b6428 ] + +CMP $0,%reg can't set overflow flag, so we can use shorter TEST %reg,%reg +instruction when only zero and sign flags are checked (E,L,LE,G,GE conditions). + +Signed-off-by: Uros Bizjak +Cc: Herbert Xu +Cc: Borislav Petkov +Cc: "H. Peter Anvin" +Signed-off-by: Herbert Xu +Signed-off-by: Sasha Levin +--- + arch/x86/crypto/aesni-intel_asm.S | 20 ++++++++++---------- + arch/x86/crypto/aesni-intel_avx-x86_64.S | 20 ++++++++++---------- + 2 files changed, 20 insertions(+), 20 deletions(-) + +diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S +index 1852b19a73a0..d1436c37008b 100644 +--- a/arch/x86/crypto/aesni-intel_asm.S ++++ b/arch/x86/crypto/aesni-intel_asm.S +@@ -318,7 +318,7 @@ _initial_blocks_\@: + + # Main loop - Encrypt/Decrypt remaining blocks + +- cmp $0, %r13 ++ test %r13, %r13 + je _zero_cipher_left_\@ + sub $64, %r13 + je _four_cipher_left_\@ +@@ -437,7 +437,7 @@ _multiple_of_16_bytes_\@: + + mov PBlockLen(%arg2), %r12 + +- cmp $0, %r12 ++ test %r12, %r12 + je _partial_done\@ + + GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 +@@ -474,7 +474,7 @@ _T_8_\@: + add $8, %r10 + sub $8, %r11 + psrldq $8, %xmm0 +- cmp $0, %r11 ++ test %r11, %r11 + je _return_T_done_\@ + _T_4_\@: + movd %xmm0, %eax +@@ -482,7 +482,7 @@ _T_4_\@: + add $4, %r10 + sub $4, %r11 + psrldq $4, %xmm0 +- cmp $0, %r11 ++ test %r11, %r11 + je _return_T_done_\@ + _T_123_\@: + movd %xmm0, %eax +@@ -619,7 +619,7 @@ _get_AAD_blocks\@: + + /* read the last <16B of AAD */ + _get_AAD_rest\@: +- cmp $0, %r11 ++ test %r11, %r11 + je _get_AAD_done\@ + + READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 +@@ -640,7 +640,7 @@ _get_AAD_done\@: + .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ + AAD_HASH operation + mov PBlockLen(%arg2), %r13 +- cmp $0, %r13 ++ test %r13, %r13 + je _partial_block_done_\@ # Leave Macro if no partial blocks + # Read in input data without over reading + cmp $16, \PLAIN_CYPH_LEN +@@ -692,7 +692,7 @@ _no_extra_mask_1_\@: + pshufb %xmm2, %xmm3 + pxor %xmm3, \AAD_HASH + +- cmp $0, %r10 ++ test %r10, %r10 + jl _partial_incomplete_1_\@ + + # GHASH computation for the last <16 Byte block +@@ -727,7 +727,7 @@ _no_extra_mask_2_\@: + pshufb %xmm2, %xmm9 + pxor %xmm9, \AAD_HASH + +- cmp $0, %r10 ++ test %r10, %r10 + jl _partial_incomplete_2_\@ + + # GHASH computation for the last <16 Byte block +@@ -747,7 +747,7 @@ _encode_done_\@: + pshufb %xmm2, %xmm9 + .endif + # output encrypted Bytes +- cmp $0, %r10 ++ test %r10, %r10 + jl _partial_fill_\@ + mov %r13, %r12 + mov $16, %r13 +@@ -2720,7 +2720,7 @@ SYM_FUNC_END(aesni_ctr_enc) + */ + 
SYM_FUNC_START(aesni_xts_crypt8) + FRAME_BEGIN +- cmpb $0, %cl ++ testb %cl, %cl + movl $0, %ecx + movl $240, %r10d + leaq _aesni_enc4, %r11 +diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S +index 5fee47956f3b..2cf8e94d986a 100644 +--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S ++++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S +@@ -369,7 +369,7 @@ _initial_num_blocks_is_0\@: + + + _initial_blocks_encrypted\@: +- cmp $0, %r13 ++ test %r13, %r13 + je _zero_cipher_left\@ + + sub $128, %r13 +@@ -528,7 +528,7 @@ _multiple_of_16_bytes\@: + vmovdqu HashKey(arg2), %xmm13 + + mov PBlockLen(arg2), %r12 +- cmp $0, %r12 ++ test %r12, %r12 + je _partial_done\@ + + #GHASH computation for the last <16 Byte block +@@ -573,7 +573,7 @@ _T_8\@: + add $8, %r10 + sub $8, %r11 + vpsrldq $8, %xmm9, %xmm9 +- cmp $0, %r11 ++ test %r11, %r11 + je _return_T_done\@ + _T_4\@: + vmovd %xmm9, %eax +@@ -581,7 +581,7 @@ _T_4\@: + add $4, %r10 + sub $4, %r11 + vpsrldq $4, %xmm9, %xmm9 +- cmp $0, %r11 ++ test %r11, %r11 + je _return_T_done\@ + _T_123\@: + vmovd %xmm9, %eax +@@ -625,7 +625,7 @@ _get_AAD_blocks\@: + cmp $16, %r11 + jge _get_AAD_blocks\@ + vmovdqu \T8, \T7 +- cmp $0, %r11 ++ test %r11, %r11 + je _get_AAD_done\@ + + vpxor \T7, \T7, \T7 +@@ -644,7 +644,7 @@ _get_AAD_rest8\@: + vpxor \T1, \T7, \T7 + jmp _get_AAD_rest8\@ + _get_AAD_rest4\@: +- cmp $0, %r11 ++ test %r11, %r11 + jle _get_AAD_rest0\@ + mov (%r10), %eax + movq %rax, \T1 +@@ -749,7 +749,7 @@ _done_read_partial_block_\@: + .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ + AAD_HASH ENC_DEC + mov PBlockLen(arg2), %r13 +- cmp $0, %r13 ++ test %r13, %r13 + je _partial_block_done_\@ # Leave Macro if no partial blocks + # Read in input data without over reading + cmp $16, \PLAIN_CYPH_LEN +@@ -801,7 +801,7 @@ _no_extra_mask_1_\@: + vpshufb %xmm2, %xmm3, %xmm3 + vpxor %xmm3, \AAD_HASH, \AAD_HASH + +- cmp $0, %r10 ++ test %r10, %r10 + jl _partial_incomplete_1_\@ + + # GHASH computation for the last <16 Byte block +@@ -836,7 +836,7 @@ _no_extra_mask_2_\@: + vpshufb %xmm2, %xmm9, %xmm9 + vpxor %xmm9, \AAD_HASH, \AAD_HASH + +- cmp $0, %r10 ++ test %r10, %r10 + jl _partial_incomplete_2_\@ + + # GHASH computation for the last <16 Byte block +@@ -856,7 +856,7 @@ _encode_done_\@: + vpshufb %xmm2, %xmm9, %xmm9 + .endif + # output encrypted Bytes +- cmp $0, %r10 ++ test %r10, %r10 + jl _partial_fill_\@ + mov %r13, %r12 + mov $16, %r13 +-- +2.30.1 + diff --git a/queue-5.10/crypto-x86-aes-ni-xts-use-direct-calls-to-and-4-way-.patch b/queue-5.10/crypto-x86-aes-ni-xts-use-direct-calls-to-and-4-way-.patch new file mode 100644 index 00000000000..8310e7a82a5 --- /dev/null +++ b/queue-5.10/crypto-x86-aes-ni-xts-use-direct-calls-to-and-4-way-.patch @@ -0,0 +1,278 @@ +From fcbbb7a6484bd5ae998197af4bb02dec2f495414 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 31 Dec 2020 17:41:54 +0100 +Subject: crypto: x86/aes-ni-xts - use direct calls to and 4-way stride + +From: Ard Biesheuvel + +[ Upstream commit 86ad60a65f29dd862a11c22bb4b5be28d6c5cef1 ] + +The XTS asm helper arrangement is a bit odd: the 8-way stride helper +consists of back-to-back calls to the 4-way core transforms, which +are called indirectly, based on a boolean that indicates whether we +are performing encryption or decryption. 
+ +Given how costly indirect calls are on x86, let's switch to direct +calls, and given how the 8-way stride doesn't really add anything +substantial, use a 4-way stride instead, and make the asm core +routine deal with any multiple of 4 blocks. Since 512 byte sectors +or 4 KB blocks are the typical quantities XTS operates on, increase +the stride exported to the glue helper to 512 bytes as well. + +As a result, the number of indirect calls is reduced from 3 per 64 bytes +of in/output to 1 per 512 bytes of in/output, which produces a 65% speedup +when operating on 1 KB blocks (measured on a Intel(R) Core(TM) i7-8650U CPU) + +Fixes: 9697fa39efd3f ("x86/retpoline/crypto: Convert crypto assembler indirect jumps") +Tested-by: Eric Biggers # x86_64 +Signed-off-by: Ard Biesheuvel +Signed-off-by: Herbert Xu +Signed-off-by: Sasha Levin +--- + arch/x86/crypto/aesni-intel_asm.S | 115 ++++++++++++++++++----------- + arch/x86/crypto/aesni-intel_glue.c | 25 ++++--- + 2 files changed, 84 insertions(+), 56 deletions(-) + +diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S +index d1436c37008b..57aef3f5a81e 100644 +--- a/arch/x86/crypto/aesni-intel_asm.S ++++ b/arch/x86/crypto/aesni-intel_asm.S +@@ -2715,25 +2715,18 @@ SYM_FUNC_END(aesni_ctr_enc) + pxor CTR, IV; + + /* +- * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst, +- * const u8 *src, bool enc, le128 *iv) ++ * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst, ++ * const u8 *src, unsigned int len, le128 *iv) + */ +-SYM_FUNC_START(aesni_xts_crypt8) ++SYM_FUNC_START(aesni_xts_encrypt) + FRAME_BEGIN +- testb %cl, %cl +- movl $0, %ecx +- movl $240, %r10d +- leaq _aesni_enc4, %r11 +- leaq _aesni_dec4, %rax +- cmovel %r10d, %ecx +- cmoveq %rax, %r11 + + movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK + movups (IVP), IV + + mov 480(KEYP), KLEN +- addq %rcx, KEYP + ++.Lxts_enc_loop4: + movdqa IV, STATE1 + movdqu 0x00(INP), INC + pxor INC, STATE1 +@@ -2757,71 +2750,103 @@ SYM_FUNC_START(aesni_xts_crypt8) + pxor INC, STATE4 + movdqu IV, 0x30(OUTP) + +- CALL_NOSPEC r11 ++ call _aesni_enc4 + + movdqu 0x00(OUTP), INC + pxor INC, STATE1 + movdqu STATE1, 0x00(OUTP) + +- _aesni_gf128mul_x_ble() +- movdqa IV, STATE1 +- movdqu 0x40(INP), INC +- pxor INC, STATE1 +- movdqu IV, 0x40(OUTP) +- + movdqu 0x10(OUTP), INC + pxor INC, STATE2 + movdqu STATE2, 0x10(OUTP) + +- _aesni_gf128mul_x_ble() +- movdqa IV, STATE2 +- movdqu 0x50(INP), INC +- pxor INC, STATE2 +- movdqu IV, 0x50(OUTP) +- + movdqu 0x20(OUTP), INC + pxor INC, STATE3 + movdqu STATE3, 0x20(OUTP) + +- _aesni_gf128mul_x_ble() +- movdqa IV, STATE3 +- movdqu 0x60(INP), INC +- pxor INC, STATE3 +- movdqu IV, 0x60(OUTP) +- + movdqu 0x30(OUTP), INC + pxor INC, STATE4 + movdqu STATE4, 0x30(OUTP) + + _aesni_gf128mul_x_ble() +- movdqa IV, STATE4 +- movdqu 0x70(INP), INC +- pxor INC, STATE4 +- movdqu IV, 0x70(OUTP) + +- _aesni_gf128mul_x_ble() ++ add $64, INP ++ add $64, OUTP ++ sub $64, LEN ++ ja .Lxts_enc_loop4 ++ + movups IV, (IVP) + +- CALL_NOSPEC r11 ++ FRAME_END ++ ret ++SYM_FUNC_END(aesni_xts_encrypt) ++ ++/* ++ * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst, ++ * const u8 *src, unsigned int len, le128 *iv) ++ */ ++SYM_FUNC_START(aesni_xts_decrypt) ++ FRAME_BEGIN ++ ++ movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK ++ movups (IVP), IV ++ ++ mov 480(KEYP), KLEN ++ add $240, KEYP + +- movdqu 0x40(OUTP), INC ++.Lxts_dec_loop4: ++ movdqa IV, STATE1 ++ movdqu 0x00(INP), INC + pxor INC, STATE1 +- movdqu STATE1, 0x40(OUTP) ++ movdqu IV, 0x00(OUTP) + 
+- movdqu 0x50(OUTP), INC ++ _aesni_gf128mul_x_ble() ++ movdqa IV, STATE2 ++ movdqu 0x10(INP), INC ++ pxor INC, STATE2 ++ movdqu IV, 0x10(OUTP) ++ ++ _aesni_gf128mul_x_ble() ++ movdqa IV, STATE3 ++ movdqu 0x20(INP), INC ++ pxor INC, STATE3 ++ movdqu IV, 0x20(OUTP) ++ ++ _aesni_gf128mul_x_ble() ++ movdqa IV, STATE4 ++ movdqu 0x30(INP), INC ++ pxor INC, STATE4 ++ movdqu IV, 0x30(OUTP) ++ ++ call _aesni_dec4 ++ ++ movdqu 0x00(OUTP), INC ++ pxor INC, STATE1 ++ movdqu STATE1, 0x00(OUTP) ++ ++ movdqu 0x10(OUTP), INC + pxor INC, STATE2 +- movdqu STATE2, 0x50(OUTP) ++ movdqu STATE2, 0x10(OUTP) + +- movdqu 0x60(OUTP), INC ++ movdqu 0x20(OUTP), INC + pxor INC, STATE3 +- movdqu STATE3, 0x60(OUTP) ++ movdqu STATE3, 0x20(OUTP) + +- movdqu 0x70(OUTP), INC ++ movdqu 0x30(OUTP), INC + pxor INC, STATE4 +- movdqu STATE4, 0x70(OUTP) ++ movdqu STATE4, 0x30(OUTP) ++ ++ _aesni_gf128mul_x_ble() ++ ++ add $64, INP ++ add $64, OUTP ++ sub $64, LEN ++ ja .Lxts_dec_loop4 ++ ++ movups IV, (IVP) + + FRAME_END + ret +-SYM_FUNC_END(aesni_xts_crypt8) ++SYM_FUNC_END(aesni_xts_decrypt) + + #endif +diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c +index f9a1d98e7534..be891fdf8d17 100644 +--- a/arch/x86/crypto/aesni-intel_glue.c ++++ b/arch/x86/crypto/aesni-intel_glue.c +@@ -97,6 +97,12 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, + #define AVX_GEN2_OPTSIZE 640 + #define AVX_GEN4_OPTSIZE 4096 + ++asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out, ++ const u8 *in, unsigned int len, u8 *iv); ++ ++asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out, ++ const u8 *in, unsigned int len, u8 *iv); ++ + #ifdef CONFIG_X86_64 + + static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out, +@@ -104,9 +110,6 @@ static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out, + asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); + +-asmlinkage void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *out, +- const u8 *in, bool enc, le128 *iv); +- + /* asmlinkage void aesni_gcm_enc() + * void *ctx, AES Key schedule. Starts on a 16 byte boundary. + * struct gcm_context_data. May be uninitialized. 
+@@ -547,14 +550,14 @@ static void aesni_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) + glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec); + } + +-static void aesni_xts_enc8(const void *ctx, u8 *dst, const u8 *src, le128 *iv) ++static void aesni_xts_enc32(const void *ctx, u8 *dst, const u8 *src, le128 *iv) + { +- aesni_xts_crypt8(ctx, dst, src, true, iv); ++ aesni_xts_encrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv); + } + +-static void aesni_xts_dec8(const void *ctx, u8 *dst, const u8 *src, le128 *iv) ++static void aesni_xts_dec32(const void *ctx, u8 *dst, const u8 *src, le128 *iv) + { +- aesni_xts_crypt8(ctx, dst, src, false, iv); ++ aesni_xts_decrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv); + } + + static const struct common_glue_ctx aesni_enc_xts = { +@@ -562,8 +565,8 @@ static const struct common_glue_ctx aesni_enc_xts = { + .fpu_blocks_limit = 1, + + .funcs = { { +- .num_blocks = 8, +- .fn_u = { .xts = aesni_xts_enc8 } ++ .num_blocks = 32, ++ .fn_u = { .xts = aesni_xts_enc32 } + }, { + .num_blocks = 1, + .fn_u = { .xts = aesni_xts_enc } +@@ -575,8 +578,8 @@ static const struct common_glue_ctx aesni_dec_xts = { + .fpu_blocks_limit = 1, + + .funcs = { { +- .num_blocks = 8, +- .fn_u = { .xts = aesni_xts_dec8 } ++ .num_blocks = 32, ++ .fn_u = { .xts = aesni_xts_dec32 } + }, { + .num_blocks = 1, + .fn_u = { .xts = aesni_xts_dec } +-- +2.30.1 + diff --git a/queue-5.10/series b/queue-5.10/series new file mode 100644 index 00000000000..80b512a47a2 --- /dev/null +++ b/queue-5.10/series @@ -0,0 +1,2 @@ +crypto-aesni-use-test-reg-reg-instead-of-cmp-0-reg.patch +crypto-x86-aes-ni-xts-use-direct-calls-to-and-4-way-.patch
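
For reference, the flag-equivalence argument behind the first queued patch can be checked with a small standalone sketch. The snippet below is illustrative only and is not part of either patch: the file and function names are made up, and it tests the SysV argument register %rdi rather than the %r10-%r13/%cl operands touched in aesni-intel_asm.S, but the substitution is the same one the patch performs. For a comparison against zero, both CMP $0,%reg and TEST %reg,%reg leave ZF and SF reflecting the register value with OF ending up 0, so the e/ne/l/le/g/ge conditions decide identically, while the TEST form omits the immediate byte and encodes one byte shorter.

	# cmp-vs-test.S - hypothetical demo, assemble with: gcc -c cmp-vs-test.S
	# int is_nonzero_cmp(long x) and int is_nonzero_test(long x) return 1 iff x != 0.
	.text
	.globl	is_nonzero_cmp
	.globl	is_nonzero_test

is_nonzero_cmp:
	cmp	$0, %rdi		# %rdi - 0: ZF/SF follow %rdi, OF and CF come out 0 (imm8 form, 4 bytes)
	je	1f			# taken only when %rdi == 0
	mov	$1, %eax
	ret
1:	xor	%eax, %eax
	ret

is_nonzero_test:
	test	%rdi, %rdi		# %rdi AND %rdi: same ZF/SF, TEST always clears OF/CF (3 bytes)
	je	1f			# identical branch decision to the CMP version
	mov	$1, %eax
	ret
1:	xor	%eax, %eax
	ret

The queued patch applies exactly this swap at sites that go on to use only the e, l and le conditions (je/jl/jle branches and the cmovel/cmoveq pair in aesni_xts_crypt8), so behaviour is unchanged while each rewritten instruction shrinks slightly.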