git.ipfire.org Git - thirdparty/linux.git/commitdiff
lib/crypto: x86/blake2s: Use vpternlogd for 3-input XORs
author Eric Biggers <ebiggers@kernel.org>
Sun, 2 Nov 2025 23:42:09 +0000 (15:42 -0800)
committer Eric Biggers <ebiggers@kernel.org>
Thu, 6 Nov 2025 04:30:52 +0000 (20:30 -0800)
AVX-512 supports 3-input XORs via the vpternlogd (or vpternlogq)
instruction with immediate 0x96.  This approach, vs. the alternative of
two vpxor instructions, is already used in the CRC, AES-GCM, and AES-XTS
code, since it reduces the instruction count and is faster on some CPUs.
Make blake2s_compress_avx512() take advantage of it too.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251102234209.62133-7-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
lib/crypto/x86/blake2s-core.S

index 869064f6ac16e27602c3ec1035e06707655ccf02..7b1d98ca7482c10a2e02513c06c9752901eaf424 100644 (file)
@@ -278,10 +278,8 @@ SYM_FUNC_START(blake2s_compress_avx512)
        jne             .Lavx512_roundloop
 
        // Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
-       vpxor           %xmm10,%xmm0,%xmm0
-       vpxor           %xmm11,%xmm1,%xmm1
-       vpxor           %xmm2,%xmm0,%xmm0
-       vpxor           %xmm3,%xmm1,%xmm1
+       vpternlogd      $0x96,%xmm10,%xmm2,%xmm0
+       vpternlogd      $0x96,%xmm11,%xmm3,%xmm1
        decq            NBLOCKS
        jne             .Lavx512_mainloop