From 8ba60c5914f25a44f10189c6919a737b199f6dbf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 2 Nov 2025 15:42:09 -0800 Subject: [PATCH] lib/crypto: x86/blake2s: Use vpternlogd for 3-input XORs AVX-512 supports 3-input XORs via the vpternlogd (or vpternlogq) instruction with immediate 0x96. This approach, vs. the alternative of two vpxor instructions, is already used in the CRC, AES-GCM, and AES-XTS code, since it reduces the instruction count and is faster on some CPUs. Make blake2s_compress_avx512() take advantage of it too. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251102234209.62133-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/x86/blake2s-core.S | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S index 869064f6ac16e..7b1d98ca7482c 100644 --- a/lib/crypto/x86/blake2s-core.S +++ b/lib/crypto/x86/blake2s-core.S @@ -278,10 +278,8 @@ SYM_FUNC_START(blake2s_compress_avx512) jne .Lavx512_roundloop // Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15] - vpxor %xmm10,%xmm0,%xmm0 - vpxor %xmm11,%xmm1,%xmm1 - vpxor %xmm2,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 + vpternlogd $0x96,%xmm10,%xmm2,%xmm0 + vpternlogd $0x96,%xmm11,%xmm3,%xmm1 decq NBLOCKS jne .Lavx512_mainloop -- 2.47.3