From: Eric Biggers <ebiggers@google.com>
Date: Thu, 12 Dec 2024 21:28:44 +0000 (-0800)
Subject: crypto: x86/aes-xts - more code size optimizations
X-Git-Tag: v6.14-rc1~112^2~34
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=68e95f5c6418ce1d0171fa756608a84170c56165;p=thirdparty%2Flinux.git

crypto: x86/aes-xts - more code size optimizations

Prefer immediates of -128 to 128, since the former fits in a signed
byte, saving 3 bytes per instruction.  Also prefer VEX-coded
instructions to EVEX where this is easy to do.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index c4e8ba6ed61df..0e6b9ae12e95e 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -188,6 +188,7 @@
 .endm
 
 // Move a vector between memory and a register.
+// The register operand must be in the first 16 vector registers.
 .macro	_vmovdqu	src, dst
 .if VL < 64
 	vmovdqu		\src, \dst
@@ -208,11 +209,12 @@
 .endm
 
 // XOR two vectors together.
+// Any register operands must be in the first 16 vector registers.
 .macro	_vpxor	src1, src2, dst
-.if USE_AVX10
-	vpxord		\src1, \src2, \dst
-.else
+.if VL < 64
 	vpxor		\src1, \src2, \dst
+.else
+	vpxord		\src1, \src2, \dst
 .endif
 .endm
 
@@ -555,7 +557,7 @@
 	// Compute the first set of tweaks TWEAK[0-3].
 	_compute_first_set_of_tweaks
 
-	sub		$4*VL, LEN
+	add		$-4*VL, LEN  // shorter than 'sub 4*VL' when VL=32
 	jl		.Lhandle_remainder\@
 
 .Lmain_loop\@:
@@ -563,10 +565,10 @@
 
 	// XOR each source block with its tweak and the zero-th round key.
 .if USE_AVX10
-	vmovdqu8	0*VL(SRC), V0
-	vmovdqu8	1*VL(SRC), V1
-	vmovdqu8	2*VL(SRC), V2
-	vmovdqu8	3*VL(SRC), V3
+	_vmovdqu	0*VL(SRC), V0
+	_vmovdqu	1*VL(SRC), V1
+	_vmovdqu	2*VL(SRC), V2
+	_vmovdqu	3*VL(SRC), V3
 	vpternlogd	$0x96, TWEAK0, KEY0, V0
 	vpternlogd	$0x96, TWEAK1, KEY0, V1
 	vpternlogd	$0x96, TWEAK2, KEY0, V2
@@ -612,9 +614,9 @@
 	// Finish computing the next set of tweaks.
 	_tweak_step	1000
 
-	add		$4*VL, SRC
-	add		$4*VL, DST
-	sub		$4*VL, LEN
+	sub		$-4*VL, SRC  // shorter than 'add 4*VL' when VL=32
+	sub		$-4*VL, DST
+	add		$-4*VL, LEN
 	jge		.Lmain_loop\@
 
 	// Check for the uncommon case where the data length isn't a multiple of