From: Eric Biggers Date: Thu, 12 Dec 2024 21:28:44 +0000 (-0800) Subject: crypto: x86/aes-xts - more code size optimizations X-Git-Tag: v6.14-rc1~112^2~34 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=68e95f5c6418ce1d0171fa756608a84170c56165;p=thirdparty%2Flinux.git crypto: x86/aes-xts - more code size optimizations Prefer immediates of -128 to 128, since the former fits in a signed byte, saving 3 bytes per instruction. Also prefer VEX-coded instructions to EVEX where this is easy to do. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S index c4e8ba6ed61df..0e6b9ae12e95e 100644 --- a/arch/x86/crypto/aes-xts-avx-x86_64.S +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -188,6 +188,7 @@ .endm // Move a vector between memory and a register. +// The register operand must be in the first 16 vector registers. .macro _vmovdqu src, dst .if VL < 64 vmovdqu \src, \dst @@ -208,11 +209,12 @@ .endm // XOR two vectors together. +// Any register operands must be in the first 16 vector registers. .macro _vpxor src1, src2, dst -.if USE_AVX10 - vpxord \src1, \src2, \dst -.else +.if VL < 64 vpxor \src1, \src2, \dst +.else + vpxord \src1, \src2, \dst .endif .endm @@ -555,7 +557,7 @@ // Compute the first set of tweaks TWEAK[0-3]. _compute_first_set_of_tweaks - sub $4*VL, LEN + add $-4*VL, LEN // shorter than 'sub 4*VL' when VL=32 jl .Lhandle_remainder\@ .Lmain_loop\@: @@ -563,10 +565,10 @@ // XOR each source block with its tweak and the zero-th round key. .if USE_AVX10 - vmovdqu8 0*VL(SRC), V0 - vmovdqu8 1*VL(SRC), V1 - vmovdqu8 2*VL(SRC), V2 - vmovdqu8 3*VL(SRC), V3 + _vmovdqu 0*VL(SRC), V0 + _vmovdqu 1*VL(SRC), V1 + _vmovdqu 2*VL(SRC), V2 + _vmovdqu 3*VL(SRC), V3 vpternlogd $0x96, TWEAK0, KEY0, V0 vpternlogd $0x96, TWEAK1, KEY0, V1 vpternlogd $0x96, TWEAK2, KEY0, V2 @@ -612,9 +614,9 @@ // Finish computing the next set of tweaks. _tweak_step 1000 - add $4*VL, SRC - add $4*VL, DST - sub $4*VL, LEN + sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 + sub $-4*VL, DST + add $-4*VL, LEN jge .Lmain_loop\@ // Check for the uncommon case where the data length isn't a multiple of