/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
-// VAES and VPCLMULQDQ optimized AES-GCM for x86_64
+// AES-GCM implementation for x86_64 CPUs that support the following CPU
+// features: VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2
//
// Copyright 2024 Google LLC
//
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-//
-// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
-// support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and
-// either AVX512 or AVX10. Some of the functions, notably the encryption and
-// decryption update functions which are the most performance-critical, are
-// provided in two variants generated from a macro: one using 256-bit vectors
-// (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The
-// other, "shared" functions (vaes_avx10) use at most 256-bit vectors.
-//
-// The functions that use 512-bit vectors are intended for CPUs that support
-// 512-bit vectors *and* where using them doesn't cause significant
-// downclocking. They require the following CPU features:
-//
-// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512)
-//
-// The other functions require the following CPU features:
-//
-// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256)
-//
-// All functions use the "System V" ABI. The Windows ABI is not supported.
-//
-// Note that we use "avx10" in the names of the functions as a shorthand to
-// really mean "AVX10 or a certain set of AVX512 features". Due to Intel's
-// introduction of AVX512 and then its replacement by AVX10, there doesn't seem
-// to be a simple way to name things that makes sense on all CPUs.
-//
-// Note that the macros that support both 256-bit and 512-bit vectors could
-// fairly easily be changed to support 128-bit too. However, this would *not*
-// be sufficient to allow the code to run on CPUs without AVX512 or AVX10,
-// because the code heavily uses several features of these extensions other than
-// the vector length: the increase in the number of SIMD registers from 16 to
-// 32, masking support, and new instructions such as vpternlogd (which can do a
-// three-argument XOR). These features are very useful for AES-GCM.
#include <linux/linkage.h>
vpternlogd $0x96, \t0, \mi, \hi
.endm
-// void aes_gcm_precompute_##suffix(struct aes_gcm_key_avx10 *key);
+// void aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key);
//
// Given the expanded AES key |key->aes_key|, this function derives the GHASH
// subkey and initializes |key->ghash_key_powers| with powers of it.
vmovdqu8 GHASHDATA3, 3*VL(DST)
.endm
-// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key,
-// const u32 le_ctr[4], u8 ghash_acc[16],
-// const u8 *src, u8 *dst, int datalen);
+// void aes_gcm_{enc,dec}_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// const u8 *src, u8 *dst, int datalen);
//
// This macro generates a GCM encryption or decryption update function with the
// above prototype (with \enc selecting which one). This macro supports both
RET
.endm
-// void aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
-// const u32 le_ctr[4], u8 ghash_acc[16],
-// u64 total_aadlen, u64 total_datalen);
-// bool aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
-// const u32 le_ctr[4],
-// const u8 ghash_acc[16],
-// u64 total_aadlen, u64 total_datalen,
-// const u8 tag[16], int taglen);
+// void aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen);
+// bool aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+// const u32 le_ctr[4],
+// const u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen,
+// const u8 tag[16], int taglen);
//
// This macro generates one of the above two functions (with \enc selecting
// which one). Both functions finish computing the GCM authentication tag by
.endm
_set_veclen 64
-SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_512)
+SYM_FUNC_START(aes_gcm_precompute_vaes_avx512)
_aes_gcm_precompute
-SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_512)
-SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_512)
+SYM_FUNC_END(aes_gcm_precompute_vaes_avx512)
+SYM_FUNC_START(aes_gcm_enc_update_vaes_avx512)
_aes_gcm_update 1
-SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_512)
-SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_512)
+SYM_FUNC_END(aes_gcm_enc_update_vaes_avx512)
+SYM_FUNC_START(aes_gcm_dec_update_vaes_avx512)
_aes_gcm_update 0
-SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_512)
+SYM_FUNC_END(aes_gcm_dec_update_vaes_avx512)
-// void aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key,
-// u8 ghash_acc[16],
-// const u8 *aad, int aadlen);
+// void aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+// u8 ghash_acc[16],
+// const u8 *aad, int aadlen);
//
// This function processes the AAD (Additional Authenticated Data) in GCM.
// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
// VEX-coded instructions instead of EVEX-coded to save some instruction bytes.
// To optimize for large amounts of AAD, we could implement a 4x-wide loop and
// provide a version using 512-bit vectors, but that doesn't seem to be useful.
-SYM_FUNC_START(aes_gcm_aad_update_vaes_avx10)
+SYM_FUNC_START(aes_gcm_aad_update_vaes_avx512)
// Function arguments
.set KEY, %rdi
vzeroupper // This is needed after using ymm or zmm registers.
RET
-SYM_FUNC_END(aes_gcm_aad_update_vaes_avx10)
+SYM_FUNC_END(aes_gcm_aad_update_vaes_avx512)
-SYM_FUNC_START(aes_gcm_enc_final_vaes_avx10)
+SYM_FUNC_START(aes_gcm_enc_final_vaes_avx512)
_aes_gcm_final 1
-SYM_FUNC_END(aes_gcm_enc_final_vaes_avx10)
-SYM_FUNC_START(aes_gcm_dec_final_vaes_avx10)
+SYM_FUNC_END(aes_gcm_enc_final_vaes_avx512)
+SYM_FUNC_START(aes_gcm_dec_final_vaes_avx512)
_aes_gcm_final 0
-SYM_FUNC_END(aes_gcm_dec_final_vaes_avx10)
+SYM_FUNC_END(aes_gcm_dec_final_vaes_avx512)
#define AES_GCM_KEY_VAES_AVX2_SIZE \
(sizeof(struct aes_gcm_key_vaes_avx2) + (31 & ~(CRYPTO_MINALIGN - 1)))
-/* Key struct used by the VAES + AVX10 implementations of AES-GCM */
-struct aes_gcm_key_avx10 {
+/* Key struct used by the VAES + AVX512 implementation of AES-GCM */
+struct aes_gcm_key_vaes_avx512 {
/*
* Common part of the key. The assembly code prefers 16-byte alignment
* for the round keys; we get this by them being located at the start of
/* Three padding blocks required by the assembly code */
u64 padding[3][2];
};
-#define AES_GCM_KEY_AVX10(key) \
- container_of((key), struct aes_gcm_key_avx10, base)
-#define AES_GCM_KEY_AVX10_SIZE \
- (sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1)))
+#define AES_GCM_KEY_VAES_AVX512(key) \
+ container_of((key), struct aes_gcm_key_vaes_avx512, base)
+#define AES_GCM_KEY_VAES_AVX512_SIZE \
+ (sizeof(struct aes_gcm_key_vaes_avx512) + (63 & ~(CRYPTO_MINALIGN - 1)))
/*
* These flags are passed to the AES-GCM helper functions to specify the
#define FLAG_ENC BIT(1)
#define FLAG_AVX BIT(2)
#define FLAG_VAES_AVX2 BIT(3)
-#define FLAG_AVX10_512 BIT(4)
+#define FLAG_VAES_AVX512 BIT(4)
static inline struct aes_gcm_key *
aes_gcm_key_get(struct crypto_aead *tfm, int flags)
{
- if (flags & FLAG_AVX10_512)
+ if (flags & FLAG_VAES_AVX512)
return PTR_ALIGN(crypto_aead_ctx(tfm), 64);
else if (flags & FLAG_VAES_AVX2)
return PTR_ALIGN(crypto_aead_ctx(tfm), 32);
asmlinkage void
aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key);
asmlinkage void
-aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key);
+aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key);
static void aes_gcm_precompute(struct aes_gcm_key *key, int flags)
{
- if (flags & FLAG_AVX10_512)
- aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key));
+ if (flags & FLAG_VAES_AVX512)
+ aes_gcm_precompute_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key));
else if (flags & FLAG_VAES_AVX2)
aes_gcm_precompute_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key));
else if (flags & FLAG_AVX)
aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
u8 ghash_acc[16], const u8 *aad, int aadlen);
asmlinkage void
-aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key,
- u8 ghash_acc[16], const u8 *aad, int aadlen);
+aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+ u8 ghash_acc[16], const u8 *aad, int aadlen);
static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16],
const u8 *aad, int aadlen, int flags)
{
- if (flags & FLAG_AVX10_512)
- aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc,
- aad, aadlen);
+ if (flags & FLAG_VAES_AVX512)
+ aes_gcm_aad_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+ ghash_acc, aad, aadlen);
else if (flags & FLAG_VAES_AVX2)
aes_gcm_aad_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
ghash_acc, aad, aadlen);
const u32 le_ctr[4], u8 ghash_acc[16],
const u8 *src, u8 *dst, int datalen);
asmlinkage void
-aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key,
- const u32 le_ctr[4], u8 ghash_acc[16],
- const u8 *src, u8 *dst, int datalen);
+aes_gcm_enc_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
asmlinkage void
aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key,
const u32 le_ctr[4], u8 ghash_acc[16],
const u8 *src, u8 *dst, int datalen);
asmlinkage void
-aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key,
- const u32 le_ctr[4], u8 ghash_acc[16],
- const u8 *src, u8 *dst, int datalen);
+aes_gcm_dec_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
/* __always_inline to optimize out the branches based on @flags */
static __always_inline void
const u8 *src, u8 *dst, int datalen, int flags)
{
if (flags & FLAG_ENC) {
- if (flags & FLAG_AVX10_512)
- aes_gcm_enc_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key),
- le_ctr, ghash_acc,
- src, dst, datalen);
+ if (flags & FLAG_VAES_AVX512)
+ aes_gcm_enc_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
else if (flags & FLAG_VAES_AVX2)
aes_gcm_enc_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
le_ctr, ghash_acc,
aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr,
ghash_acc, src, dst, datalen);
} else {
- if (flags & FLAG_AVX10_512)
- aes_gcm_dec_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key),
- le_ctr, ghash_acc,
- src, dst, datalen);
+ if (flags & FLAG_VAES_AVX512)
+ aes_gcm_dec_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
else if (flags & FLAG_VAES_AVX2)
aes_gcm_dec_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
le_ctr, ghash_acc,
const u32 le_ctr[4], u8 ghash_acc[16],
u64 total_aadlen, u64 total_datalen);
asmlinkage void
-aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
- const u32 le_ctr[4], u8 ghash_acc[16],
- u64 total_aadlen, u64 total_datalen);
+aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen);
/* __always_inline to optimize out the branches based on @flags */
static __always_inline void
const u32 le_ctr[4], u8 ghash_acc[16],
u64 total_aadlen, u64 total_datalen, int flags)
{
- if (flags & FLAG_AVX10_512)
- aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key),
- le_ctr, ghash_acc,
- total_aadlen, total_datalen);
+ if (flags & FLAG_VAES_AVX512)
+ aes_gcm_enc_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen);
else if (flags & FLAG_VAES_AVX2)
aes_gcm_enc_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
le_ctr, ghash_acc,
u64 total_aadlen, u64 total_datalen,
const u8 tag[16], int taglen);
asmlinkage bool __must_check
-aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
- const u32 le_ctr[4], const u8 ghash_acc[16],
- u64 total_aadlen, u64 total_datalen,
- const u8 tag[16], int taglen);
+aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+ const u32 le_ctr[4], const u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen,
+ const u8 tag[16], int taglen);
/* __always_inline to optimize out the branches based on @flags */
static __always_inline bool __must_check
u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen,
u8 tag[16], int taglen, int flags)
{
- if (flags & FLAG_AVX10_512)
- return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key),
- le_ctr, ghash_acc,
- total_aadlen, total_datalen,
- tag, taglen);
+ if (flags & FLAG_VAES_AVX512)
+ return aes_gcm_dec_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen,
+ tag, taglen);
else if (flags & FLAG_VAES_AVX2)
return aes_gcm_dec_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
le_ctr, ghash_acc,
BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.key_length) != 480);
BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers) != 512);
BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers_xored) != 640);
- BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0);
- BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480);
- BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512);
- BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, padding) != 768);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_enc) != 0);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_length) != 480);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, h_powers) != 512);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, padding) != 768);
if (likely(crypto_simd_usable())) {
err = aes_check_keylen(keylen);
gf128mul_lle(&h, (const be128 *)x_to_the_minus1);
/* Compute the needed key powers */
- if (flags & FLAG_AVX10_512) {
- struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key);
+ if (flags & FLAG_VAES_AVX512) {
+ struct aes_gcm_key_vaes_avx512 *k =
+ AES_GCM_KEY_VAES_AVX512(key);
for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) {
k->h_powers[i][0] = be64_to_cpu(h.b);
"generic-gcm-vaes-avx2", "rfc4106-gcm-vaes-avx2",
AES_GCM_KEY_VAES_AVX2_SIZE, 600);
-/* aes_gcm_algs_vaes_avx10_512 */
-DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512,
- "generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512",
- AES_GCM_KEY_AVX10_SIZE, 800);
+/* aes_gcm_algs_vaes_avx512 */
+DEFINE_GCM_ALGS(vaes_avx512, FLAG_VAES_AVX512,
+ "generic-gcm-vaes-avx512", "rfc4106-gcm-vaes-avx512",
+ AES_GCM_KEY_VAES_AVX512_SIZE, 800);
static int __init register_avx_algs(void)
{
for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++)
skcipher_algs_vaes_avx512[i].base.cra_priority = 1;
- for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++)
- aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1;
+ for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx512); i++)
+ aes_gcm_algs_vaes_avx512[i].base.cra_priority = 1;
}
err = crypto_register_skciphers(skcipher_algs_vaes_avx512,
ARRAY_SIZE(skcipher_algs_vaes_avx512));
if (err)
return err;
- err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_512,
- ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512));
+ err = crypto_register_aeads(aes_gcm_algs_vaes_avx512,
+ ARRAY_SIZE(aes_gcm_algs_vaes_avx512));
if (err)
return err;
unregister_skciphers(skcipher_algs_vaes_avx2);
unregister_skciphers(skcipher_algs_vaes_avx512);
unregister_aeads(aes_gcm_algs_vaes_avx2);
- unregister_aeads(aes_gcm_algs_vaes_avx10_512);
+ unregister_aeads(aes_gcm_algs_vaes_avx512);
}
#else /* CONFIG_X86_64 */
static struct aead_alg aes_gcm_algs_aesni[0];