crypto: x86/aes-xts - more code size optimizations
author Eric Biggers <ebiggers@google.com>
Thu, 12 Dec 2024 21:28:44 +0000 (13:28 -0800)
committer Herbert Xu <herbert@gondor.apana.org.au>
Sat, 21 Dec 2024 14:46:24 +0000 (22:46 +0800)
Prefer immediates of -128 to 128, since the former fits in a signed
byte, saving 3 bytes per instruction.  Also prefer VEX-coded
instructions to EVEX where this is easy to do.
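
As a rough illustration of both points, a minimal standalone .S sketch
(register choices are arbitrary, not taken from the file; the byte
sequences are what GNU as should emit for these operand forms):

	// imm32 vs. imm8 form of the same 128-byte adjustment:
	sub		$128, %rdx		// 48 81 ea 80 00 00 00  (7 bytes, imm32)
	add		$-128, %rdx		// 48 83 c2 80           (4 bytes, imm8)

	// VEX vs. EVEX encoding of a 256-bit XOR, operands in ymm0-15:
	vpxor		%ymm1, %ymm2, %ymm3	// c5 ed ef d9           (4 bytes, VEX)
	vpxord		%ymm1, %ymm2, %ymm3	// 62 f1 6d 28 ef d9     (6 bytes, EVEX)

This is why the pointer/length updates below switch to -4*VL immediates
(4*VL = 128 when VL=32), and why _vmovdqu and _vpxor use the VEX-coded
vmovdqu/vpxor rather than the EVEX-only vmovdqu8/vpxord when VL < 64.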

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/x86/crypto/aes-xts-avx-x86_64.S

index c4e8ba6ed61df4b53b8b8ba16da1a890441cb629..0e6b9ae12e95eb73f70bfd2aa8ab98a70067329d 100644
 .endm
 
 // Move a vector between memory and a register.
+// The register operand must be in the first 16 vector registers.
 .macro _vmovdqu        src, dst
 .if VL < 64
        vmovdqu         \src, \dst
 .endm
 
 // XOR two vectors together.
+// Any register operands must be in the first 16 vector registers.
 .macro _vpxor  src1, src2, dst
-.if USE_AVX10
-       vpxord          \src1, \src2, \dst
-.else
+.if VL < 64
        vpxor           \src1, \src2, \dst
+.else
+       vpxord          \src1, \src2, \dst
 .endif
 .endm
 
        // Compute the first set of tweaks TWEAK[0-3].
        _compute_first_set_of_tweaks
 
-       sub             $4*VL, LEN
+       add             $-4*VL, LEN  // shorter than 'sub 4*VL' when VL=32
        jl              .Lhandle_remainder\@
 
 .Lmain_loop\@:
 
        // XOR each source block with its tweak and the zero-th round key.
 .if USE_AVX10
-       vmovdqu8        0*VL(SRC), V0
-       vmovdqu8        1*VL(SRC), V1
-       vmovdqu8        2*VL(SRC), V2
-       vmovdqu8        3*VL(SRC), V3
+       _vmovdqu        0*VL(SRC), V0
+       _vmovdqu        1*VL(SRC), V1
+       _vmovdqu        2*VL(SRC), V2
+       _vmovdqu        3*VL(SRC), V3
        vpternlogd      $0x96, TWEAK0, KEY0, V0
        vpternlogd      $0x96, TWEAK1, KEY0, V1
        vpternlogd      $0x96, TWEAK2, KEY0, V2
        // Finish computing the next set of tweaks.
        _tweak_step     1000
 
-       add             $4*VL, SRC
-       add             $4*VL, DST
-       sub             $4*VL, LEN
+       sub             $-4*VL, SRC  // shorter than 'add 4*VL' when VL=32
+       sub             $-4*VL, DST
+       add             $-4*VL, LEN
        jge             .Lmain_loop\@
 
        // Check for the uncommon case where the data length isn't a multiple of