crypto: x86/aes-gcm - tune better for AMD CPUs

author Eric Biggers <ebiggers@google.com>

Thu, 12 Dec 2024 21:28:39 +0000 (13:28 -0800)

committer Herbert Xu <herbert@gondor.apana.org.au>

Sat, 21 Dec 2024 14:46:24 +0000 (22:46 +0800)
author Eric Biggers <ebiggers@google.com>
Thu, 12 Dec 2024 21:28:39 +0000 (13:28 -0800)
committer Herbert Xu <herbert@gondor.apana.org.au>
Sat, 21 Dec 2024 14:46:24 +0000 (22:46 +0800)
diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S

index 8989bf9b8384ddc7911b5f97a50bda65b25e944e..02ee11083d4f8faff86722c9469053fa47b679c3 100644 (file)
--- a/arch/x86/crypto/aes-gcm-avx10-x86_64.S
+++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S
@@ -88,7 +88,7 @@
  
         // A shuffle mask that reflects the bytes of 16-byte blocks
  .Lbswap_mask:
-       .octa   0x000102030405060708090a0b0c0d0e0f
+       .octa   0x000102030405060708090a0b0c0d0e0f
  
         // This is the GHASH reducing polynomial without its constant term, i.e.
         // x^128 + x^7 + x^2 + x, represented using the backwards mapping
@@ -562,6 +562,32 @@
         vpxord          RNDKEY0, V3, V3
  .endm
  
+// Do the last AES round for the four vectors of counter blocks V0-V3, XOR the
+// 4*VL bytes at SRC with the resulting keystream, and write the result to both
+// DST and GHASHDATA[0-3].  Uses RNDKEYLAST; does not advance SRC or DST.
+.macro _aesenclast_and_xor_4x
+       // Fold the source data into the last round key first, leaving
+       // RNDKEYLAST ^ data in GHASHDATA[0-3].  This reduces latency, and is
+       // valid because vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
+       vpxord          0*VL(SRC), RNDKEYLAST, GHASHDATA0
+       vpxord          1*VL(SRC), RNDKEYLAST, GHASHDATA1
+       vpxord          2*VL(SRC), RNDKEYLAST, GHASHDATA2
+       vpxord          3*VL(SRC), RNDKEYLAST, GHASHDATA3
+
+       // Do the last AES round, with GHASHDATA[0-3] as both the key operand
+       // and the destination.  This yields the en/decrypted data directly.
+       vaesenclast     GHASHDATA0, V0, GHASHDATA0
+       vaesenclast     GHASHDATA1, V1, GHASHDATA1
+       vaesenclast     GHASHDATA2, V2, GHASHDATA2
+       vaesenclast     GHASHDATA3, V3, GHASHDATA3
+
+       // Store the en/decrypted data (4*VL bytes) to DST.
+       vmovdqu8        GHASHDATA0, 0*VL(DST)
+       vmovdqu8        GHASHDATA1, 1*VL(DST)
+       vmovdqu8        GHASHDATA2, 2*VL(DST)
+       vmovdqu8        GHASHDATA3, 3*VL(DST)
+.endm
+
  // void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key,
  //                                       const u32 le_ctr[4], u8 ghash_acc[16],
  //                                       const u8 *src, u8 *dst, int datalen);
@@ -640,7 +666,7 @@
         // LE_CTR contains the next set of little-endian counter blocks.
         .set    LE_CTR,         V12
  
-       // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys,
+       // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
         // copied to all 128-bit lanes.  RNDKEY0 is the zero-th round key,
         // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
         .set    RNDKEY0,        V13
@@ -650,15 +676,10 @@
         .set    RNDKEY_M7,      V17
         .set    RNDKEY_M6,      V18
         .set    RNDKEY_M5,      V19
-
-       // RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with
-       // the corresponding block of source data.  This is useful because
-       // vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can
-       // be computed in parallel with the AES rounds.
-       .set    RNDKEYLAST0,    V20
-       .set    RNDKEYLAST1,    V21
-       .set    RNDKEYLAST2,    V22
-       .set    RNDKEYLAST3,    V23
+       .set    RNDKEY_M4,      V20
+       .set    RNDKEY_M3,      V21
+       .set    RNDKEY_M2,      V22
+       .set    RNDKEY_M1,      V23
  
         // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x.  These
         // cannot coincide with anything used for AES encryption, since for
@@ -748,18 +769,7 @@
         add             $16, %rax
         cmp             %rax, RNDKEYLAST_PTR
         jne             1b
-       vpxord          0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
-       vpxord          1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
-       vpxord          2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
-       vpxord          3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
-       vaesenclast     RNDKEYLAST0, V0, GHASHDATA0
-       vaesenclast     RNDKEYLAST1, V1, GHASHDATA1
-       vaesenclast     RNDKEYLAST2, V2, GHASHDATA2
-       vaesenclast     RNDKEYLAST3, V3, GHASHDATA3
-       vmovdqu8        GHASHDATA0, 0*VL(DST)
-       vmovdqu8        GHASHDATA1, 1*VL(DST)
-       vmovdqu8        GHASHDATA2, 2*VL(DST)
-       vmovdqu8        GHASHDATA3, 3*VL(DST)
+       _aesenclast_and_xor_4x
         sub             $-4*VL, SRC  // shorter than 'add 4*VL' when VL=32
         sub             $-4*VL, DST
         add             $-4*VL, DATALEN
@@ -767,7 +777,7 @@
  .endif
  
         // Cache as many additional AES round keys as possible.
-.irp i, 9,8,7,6,5
+.irp i, 9,8,7,6,5,4,3,2,1
         vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY_M\i
  .endr
  
@@ -799,47 +809,14 @@
         _vaesenc_4x     RNDKEY
  128:
  
-       // XOR the source data with the last round key, saving the result in
-       // RNDKEYLAST[0-3].  This reduces latency by taking advantage of the
-       // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
-.if \enc
-       vpxord          0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
-       vpxord          1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
-       vpxord          2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
-       vpxord          3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
-.else
-       vpxord          GHASHDATA0, RNDKEYLAST, RNDKEYLAST0
-       vpxord          GHASHDATA1, RNDKEYLAST, RNDKEYLAST1
-       vpxord          GHASHDATA2, RNDKEYLAST, RNDKEYLAST2
-       vpxord          GHASHDATA3, RNDKEYLAST, RNDKEYLAST3
-.endif
-
         // Finish the AES encryption of the counter blocks in V0-V3, interleaved
         // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
-.irp i, 9,8,7,6,5
+.irp i, 9,8,7,6,5,4,3,2,1
+       _ghash_step_4x  (9 - \i)
         _vaesenc_4x     RNDKEY_M\i
-       _ghash_step_4x  (9 - \i)
-.endr
-.irp i, 4,3,2,1
-       vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY
-       _vaesenc_4x     RNDKEY
-       _ghash_step_4x  (9 - \i)
  .endr
         _ghash_step_4x  9
-
-       // Do the last AES round.  This handles the XOR with the source data
-       // too, as per the optimization described above.
-       vaesenclast     RNDKEYLAST0, V0, GHASHDATA0
-       vaesenclast     RNDKEYLAST1, V1, GHASHDATA1
-       vaesenclast     RNDKEYLAST2, V2, GHASHDATA2
-       vaesenclast     RNDKEYLAST3, V3, GHASHDATA3
-
-       // Store the en/decrypted data to DST.
-       vmovdqu8        GHASHDATA0, 0*VL(DST)
-       vmovdqu8        GHASHDATA1, 1*VL(DST)
-       vmovdqu8        GHASHDATA2, 2*VL(DST)
-       vmovdqu8        GHASHDATA3, 3*VL(DST)
-
+       _aesenclast_and_xor_4x
         sub             $-4*VL, SRC  // shorter than 'add 4*VL' when VL=32
         sub             $-4*VL, DST
         add             $-4*VL, DATALEN
@@ -940,7 +917,7 @@
         // GHASH.  However, any such blocks are all-zeroes, and the values that
         // they're multiplied with are also all-zeroes.  Therefore they just add
         // 0 * 0 = 0 to the final GHASH result, which makes no difference.
-       vmovdqu8        (POWERS_PTR), H_POW1
+       vmovdqu8        (POWERS_PTR), H_POW1
  .if \enc
         vmovdqu8        V0, V1{%k1}{z}
  .endif
author	Eric Biggers <ebiggers@google.com>
	Thu, 12 Dec 2024 21:28:39 +0000 (13:28 -0800)
committer	Herbert Xu <herbert@gondor.apana.org.au>
	Sat, 21 Dec 2024 14:46:24 +0000 (22:46 +0800)