From: Eric Biggers
Date: Thu, 12 Dec 2024 21:28:39 +0000 (-0800)
Subject: crypto: x86/aes-gcm - tune better for AMD CPUs
X-Git-Tag: v6.14-rc1~112^2~39
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=95791ccd112eddd1f31a2c7040122365add52a3f;p=thirdparty%2Fkernel%2Fstable.git

crypto: x86/aes-gcm - tune better for AMD CPUs

Reorganize the main loop to free up the RNDKEYLAST[0-3] registers and
use them for more cached round keys.  This improves performance by
about 2% on AMD Zen 4 and Zen 5.  Intel performance remains about the
same.

Signed-off-by: Eric Biggers
Signed-off-by: Herbert Xu
---

diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S
index 8989bf9b8384d..02ee11083d4f8 100644
--- a/arch/x86/crypto/aes-gcm-avx10-x86_64.S
+++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S
@@ -88,7 +88,7 @@
 
 	// A shuffle mask that reflects the bytes of 16-byte blocks
 .Lbswap_mask:
-	.octa	0x000102030405060708090a0b0c0d0e0f
+	.octa	0x000102030405060708090a0b0c0d0e0f
 
 	// This is the GHASH reducing polynomial without its constant term, i.e.
 	// x^128 + x^7 + x^2 + x, represented using the backwards mapping
@@ -562,6 +562,32 @@
 	vpxord		RNDKEY0, V3, V3
 .endm
 
+// Do the last AES round for four vectors of counter blocks V0-V3, XOR source
+// data with the resulting keystream, and write the result to DST and
+// GHASHDATA[0-3].  (Implementation differs slightly, but has the same effect.)
+.macro	_aesenclast_and_xor_4x
+	// XOR the source data with the last round key, saving the result in
+	// GHASHDATA[0-3].  This reduces latency by taking advantage of the
+	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
+	vpxord		0*VL(SRC), RNDKEYLAST, GHASHDATA0
+	vpxord		1*VL(SRC), RNDKEYLAST, GHASHDATA1
+	vpxord		2*VL(SRC), RNDKEYLAST, GHASHDATA2
+	vpxord		3*VL(SRC), RNDKEYLAST, GHASHDATA3
+
+	// Do the last AES round.  This handles the XOR with the source data
+	// too, as per the optimization described above.
+	vaesenclast	GHASHDATA0, V0, GHASHDATA0
+	vaesenclast	GHASHDATA1, V1, GHASHDATA1
+	vaesenclast	GHASHDATA2, V2, GHASHDATA2
+	vaesenclast	GHASHDATA3, V3, GHASHDATA3
+
+	// Store the en/decrypted data to DST.
+	vmovdqu8	GHASHDATA0, 0*VL(DST)
+	vmovdqu8	GHASHDATA1, 1*VL(DST)
+	vmovdqu8	GHASHDATA2, 2*VL(DST)
+	vmovdqu8	GHASHDATA3, 3*VL(DST)
+.endm
+
 // void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key,
 //					   const u32 le_ctr[4], u8 ghash_acc[16],
 //					   const u8 *src, u8 *dst, int datalen);
@@ -640,7 +666,7 @@
 	// LE_CTR contains the next set of little-endian counter blocks.
 	.set	LE_CTR, V12
 
-	// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys,
+	// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
 	// copied to all 128-bit lanes.  RNDKEY0 is the zero-th round key,
 	// RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
 	.set	RNDKEY0, V13
@@ -650,15 +676,10 @@
 	.set	RNDKEY_M7, V17
 	.set	RNDKEY_M6, V18
 	.set	RNDKEY_M5, V19
-
-	// RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with
-	// the corresponding block of source data.  This is useful because
-	// vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can
-	// be computed in parallel with the AES rounds.
-	.set	RNDKEYLAST0, V20
-	.set	RNDKEYLAST1, V21
-	.set	RNDKEYLAST2, V22
-	.set	RNDKEYLAST3, V23
+	.set	RNDKEY_M4, V20
+	.set	RNDKEY_M3, V21
+	.set	RNDKEY_M2, V22
+	.set	RNDKEY_M1, V23
 
 	// GHASHTMP[0-2] are temporary variables used by _ghash_step_4x.  These
 	// cannot coincide with anything used for AES encryption, since for
@@ -748,18 +769,7 @@
 	add		$16, %rax
 	cmp		%rax, RNDKEYLAST_PTR
 	jne		1b
-	vpxord		0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
-	vpxord		1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
-	vpxord		2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
-	vpxord		3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
-	vaesenclast	RNDKEYLAST0, V0, GHASHDATA0
-	vaesenclast	RNDKEYLAST1, V1, GHASHDATA1
-	vaesenclast	RNDKEYLAST2, V2, GHASHDATA2
-	vaesenclast	RNDKEYLAST3, V3, GHASHDATA3
-	vmovdqu8	GHASHDATA0, 0*VL(DST)
-	vmovdqu8	GHASHDATA1, 1*VL(DST)
-	vmovdqu8	GHASHDATA2, 2*VL(DST)
-	vmovdqu8	GHASHDATA3, 3*VL(DST)
+	_aesenclast_and_xor_4x
 	sub		$-4*VL, SRC	// shorter than 'add 4*VL' when VL=32
 	sub		$-4*VL, DST
 	add		$-4*VL, DATALEN
@@ -767,7 +777,7 @@
 .endif
 
 	// Cache as many additional AES round keys as possible.
-.irp i, 9,8,7,6,5
+.irp i, 9,8,7,6,5,4,3,2,1
 	vbroadcasti32x4	-\i*16(RNDKEYLAST_PTR), RNDKEY_M\i
 .endr
 
@@ -799,47 +809,14 @@
 	_vaesenc_4x	RNDKEY
128:
 
-	// XOR the source data with the last round key, saving the result in
-	// RNDKEYLAST[0-3].  This reduces latency by taking advantage of the
-	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
-.if \enc
-	vpxord		0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
-	vpxord		1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
-	vpxord		2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
-	vpxord		3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
-.else
-	vpxord		GHASHDATA0, RNDKEYLAST, RNDKEYLAST0
-	vpxord		GHASHDATA1, RNDKEYLAST, RNDKEYLAST1
-	vpxord		GHASHDATA2, RNDKEYLAST, RNDKEYLAST2
-	vpxord		GHASHDATA3, RNDKEYLAST, RNDKEYLAST3
-.endif
-
 	// Finish the AES encryption of the counter blocks in V0-V3, interleaved
 	// with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
-.irp i, 9,8,7,6,5
+.irp i, 9,8,7,6,5,4,3,2,1
+	_ghash_step_4x	(9 - \i)
 	_vaesenc_4x	RNDKEY_M\i
-	_ghash_step_4x	(9 - \i)
-.endr
-.irp i, 4,3,2,1
-	vbroadcasti32x4	-\i*16(RNDKEYLAST_PTR), RNDKEY
-	_vaesenc_4x	RNDKEY
-	_ghash_step_4x	(9 - \i)
 .endr
 	_ghash_step_4x	9
-
-	// Do the last AES round.  This handles the XOR with the source data
-	// too, as per the optimization described above.
-	vaesenclast	RNDKEYLAST0, V0, GHASHDATA0
-	vaesenclast	RNDKEYLAST1, V1, GHASHDATA1
-	vaesenclast	RNDKEYLAST2, V2, GHASHDATA2
-	vaesenclast	RNDKEYLAST3, V3, GHASHDATA3
-
-	// Store the en/decrypted data to DST.
-	vmovdqu8	GHASHDATA0, 0*VL(DST)
-	vmovdqu8	GHASHDATA1, 1*VL(DST)
-	vmovdqu8	GHASHDATA2, 2*VL(DST)
-	vmovdqu8	GHASHDATA3, 3*VL(DST)
-
+	_aesenclast_and_xor_4x
 	sub		$-4*VL, SRC	// shorter than 'add 4*VL' when VL=32
 	sub		$-4*VL, DST
 	add		$-4*VL, DATALEN
@@ -940,7 +917,7 @@
 	// GHASH.  However, any such blocks are all-zeroes, and the values that
 	// they're multiplied with are also all-zeroes.  Therefore they just add
 	// 0 * 0 = 0 to the final GHASH result, which makes no difference.
-	vmovdqu8	(POWERS_PTR), H_POW1
+	vmovdqu8	(POWERS_PTR), H_POW1
 .if \enc
 	vmovdqu8	V0, V1{%k1}{z}
 .endif
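
The optimization carried over into _aesenclast_and_xor_4x rests on the
identity vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), which holds
because the final step of AESENCLAST is simply an XOR with the round key, so
key ^ b can be folded in ahead of time.  As a quick illustration, here is a
minimal user-space C sketch (not part of the patch; the file name, the test
constants, and the use of 128-bit AES-NI intrinsics instead of the kernel's
EVEX-encoded assembly are all illustrative assumptions) that checks the
identity on one block:

	// demo.c: check vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
	// Build with: gcc -O2 -maes demo.c && ./a.out  (needs AES-NI).
	#include <immintrin.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		// Arbitrary stand-ins for the AES state a, the last round
		// key, and a block of source data b.
		const __m128i a   = _mm_set_epi32(0x00112233, 0x44556677,
						  0x08192a3b, 0x4c5d6e7f);
		const __m128i key = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908,
						  0x07060504, 0x03020100);
		const __m128i b   = _mm_set_epi32(0x13579bdf, 0x2468ace0,
						  0x0fedcba9, 0x76543210);

		// lhs: do the last AES round, then XOR in the data.
		__m128i lhs = _mm_xor_si128(_mm_aesenclast_si128(a, key), b);

		// rhs: fold the data into the round key first, as
		// _aesenclast_and_xor_4x does, then do the last round.
		__m128i rhs = _mm_aesenclast_si128(a, _mm_xor_si128(key, b));

		unsigned char l[16], r[16];
		_mm_storeu_si128((__m128i *)l, lhs);
		_mm_storeu_si128((__m128i *)r, rhs);
		printf("identity %s\n",
		       memcmp(l, r, 16) == 0 ? "holds" : "fails");
		return 0;
	}

Because key ^ b depends only on the last round key and the source data, it
can be computed while the earlier AES rounds are still in flight, which is
where the latency saving described in the macro's comment comes from.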