From: Eric Biggers <ebiggers@kernel.org>
Date: Sun, 2 Nov 2025 23:42:07 +0000 (-0800)
Subject: lib/crypto: x86/blake2s: Improve readability
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=a7acd77ebd7f17b07a6ab2ca1dd1e4d487bdfa80;p=thirdparty%2Flinux.git

lib/crypto: x86/blake2s: Improve readability

Various cleanups for readability.  No change to the generated code:

- Add some comments
- Add #defines for arguments
- Rename some labels
- Use decimal constants instead of hex where it makes sense.
  (The pshufd immediates intentionally remain as hex.)
- Add blank lines when there's a logical break

The round loop still could use some work, but this is at least a start.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251102234209.62133-5-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---

diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S
index 14e487559c096..f805a49c590d5 100644
--- a/lib/crypto/x86/blake2s-core.S
+++ b/lib/crypto/x86/blake2s-core.S
@@ -50,34 +50,52 @@
 	.byte 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
 	.byte 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
 
+#define CTX		%rdi
+#define DATA		%rsi
+#define NBLOCKS		%rdx
+#define INC		%ecx
+
 .text
+//
+// void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
+//			       const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+//	u32 h[8];	(inout)
+//	u32 t[2];	(inout)
+//	u32 f[2];	(in)
+//
 SYM_FUNC_START(blake2s_compress_ssse3)
-	movdqu		(%rdi),%xmm0
-	movdqu		0x10(%rdi),%xmm1
+	movdqu		(CTX),%xmm0		// Load h[0..3]
+	movdqu		16(CTX),%xmm1		// Load h[4..7]
 	movdqa		.Lror16(%rip),%xmm12
 	movdqa		.Lror8(%rip),%xmm13
-	movdqu		0x20(%rdi),%xmm14
-	movd		%ecx,%xmm15
-	leaq		.Lsigma+0xa0(%rip),%r8
-	jmp		.Lbeginofloop
+	movdqu		32(CTX),%xmm14		// Load t and f
+	movd		INC,%xmm15		// Load inc
+	leaq		.Lsigma+160(%rip),%r8
+	jmp		.Lssse3_mainloop
+
 .align 32
-.Lbeginofloop:
-	movdqa		%xmm0,%xmm10
-	movdqa		%xmm1,%xmm11
-	paddq		%xmm15,%xmm14
-	movdqa		.Liv(%rip),%xmm2
+.Lssse3_mainloop:
+	// Main loop: each iteration processes one 64-byte block.
+	movdqa		%xmm0,%xmm10	// Save h[0..3] and let v[0..3] = h[0..3]
+	movdqa		%xmm1,%xmm11	// Save h[4..7] and let v[4..7] = h[4..7]
+	paddq		%xmm15,%xmm14	// t += inc (64-bit addition)
+	movdqa		.Liv(%rip),%xmm2	// v[8..11] = iv[0..3]
 	movdqa		%xmm14,%xmm3
-	pxor		.Liv+0x10(%rip),%xmm3
+	pxor		.Liv+16(%rip),%xmm3	// v[12..15] = iv[4..7] ^ [t, f]
 	leaq		.Lsigma(%rip),%rcx
-.Lroundloop:
+
+.Lssse3_roundloop:
+	// Round loop: each iteration does 1 round (of 10 rounds total).
 	movzbl		(%rcx),%eax
-	movd		(%rsi,%rax,4),%xmm4
-	movzbl		0x1(%rcx),%eax
-	movd		(%rsi,%rax,4),%xmm5
-	movzbl		0x2(%rcx),%eax
-	movd		(%rsi,%rax,4),%xmm6
-	movzbl		0x3(%rcx),%eax
-	movd		(%rsi,%rax,4),%xmm7
+	movd		(DATA,%rax,4),%xmm4
+	movzbl		1(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm5
+	movzbl		2(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm6
+	movzbl		3(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm7
 	punpckldq	%xmm5,%xmm4
 	punpckldq	%xmm7,%xmm6
 	punpcklqdq	%xmm6,%xmm4
@@ -88,17 +106,17 @@ SYM_FUNC_START(blake2s_compress_ssse3)
 	paddd		%xmm3,%xmm2
 	pxor		%xmm2,%xmm1
 	movdqa		%xmm1,%xmm8
-	psrld		$0xc,%xmm1
-	pslld		$0x14,%xmm8
+	psrld		$12,%xmm1
+	pslld		$20,%xmm8
 	por		%xmm8,%xmm1
-	movzbl		0x4(%rcx),%eax
-	movd		(%rsi,%rax,4),%xmm5
-	movzbl		0x5(%rcx),%eax
-	movd		(%rsi,%rax,4),%xmm6
-	movzbl		0x6(%rcx),%eax
-	movd		(%rsi,%rax,4),%xmm7
-	movzbl		0x7(%rcx),%eax
-	movd		(%rsi,%rax,4),%xmm4
+	movzbl		4(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm5
+	movzbl		5(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm6
+	movzbl		6(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm7
+	movzbl		7(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm4
 	punpckldq	%xmm6,%xmm5
 	punpckldq	%xmm4,%xmm7
 	punpcklqdq	%xmm7,%xmm5
@@ -109,20 +127,20 @@ SYM_FUNC_START(blake2s_compress_ssse3)
 	paddd		%xmm3,%xmm2
 	pxor		%xmm2,%xmm1
 	movdqa		%xmm1,%xmm8
-	psrld		$0x7,%xmm1
-	pslld		$0x19,%xmm8
+	psrld		$7,%xmm1
+	pslld		$25,%xmm8
 	por		%xmm8,%xmm1
 	pshufd		$0x93,%xmm0,%xmm0
 	pshufd		$0x4e,%xmm3,%xmm3
 	pshufd		$0x39,%xmm2,%xmm2
-	movzbl		0x8(%rcx),%eax
-	movd		(%rsi,%rax,4),%xmm6
-	movzbl		0x9(%rcx),%eax
-	movd		(%rsi,%rax,4),%xmm7
-	movzbl		0xa(%rcx),%eax
-	movd		(%rsi,%rax,4),%xmm4
-	movzbl		0xb(%rcx),%eax
-	movd		(%rsi,%rax,4),%xmm5
+	movzbl		8(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm6
+	movzbl		9(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm7
+	movzbl		10(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm4
+	movzbl		11(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm5
 	punpckldq	%xmm7,%xmm6
 	punpckldq	%xmm5,%xmm4
 	punpcklqdq	%xmm4,%xmm6
@@ -133,17 +151,17 @@ SYM_FUNC_START(blake2s_compress_ssse3)
 	paddd		%xmm3,%xmm2
 	pxor		%xmm2,%xmm1
 	movdqa		%xmm1,%xmm8
-	psrld		$0xc,%xmm1
-	pslld		$0x14,%xmm8
+	psrld		$12,%xmm1
+	pslld		$20,%xmm8
 	por		%xmm8,%xmm1
-	movzbl		0xc(%rcx),%eax
-	movd		(%rsi,%rax,4),%xmm7
-	movzbl		0xd(%rcx),%eax
-	movd		(%rsi,%rax,4),%xmm4
-	movzbl		0xe(%rcx),%eax
-	movd		(%rsi,%rax,4),%xmm5
-	movzbl		0xf(%rcx),%eax
-	movd		(%rsi,%rax,4),%xmm6
+	movzbl		12(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm7
+	movzbl		13(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm4
+	movzbl		14(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm5
+	movzbl		15(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm6
 	punpckldq	%xmm4,%xmm7
 	punpckldq	%xmm6,%xmm5
 	punpcklqdq	%xmm5,%xmm7
@@ -154,52 +172,68 @@ SYM_FUNC_START(blake2s_compress_ssse3)
 	paddd		%xmm3,%xmm2
 	pxor		%xmm2,%xmm1
 	movdqa		%xmm1,%xmm8
-	psrld		$0x7,%xmm1
-	pslld		$0x19,%xmm8
+	psrld		$7,%xmm1
+	pslld		$25,%xmm8
 	por		%xmm8,%xmm1
 	pshufd		$0x39,%xmm0,%xmm0
 	pshufd		$0x4e,%xmm3,%xmm3
 	pshufd		$0x93,%xmm2,%xmm2
-	addq		$0x10,%rcx
+	addq		$16,%rcx
 	cmpq		%r8,%rcx
-	jnz		.Lroundloop
+	jnz		.Lssse3_roundloop
+
+	// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
 	pxor		%xmm2,%xmm0
 	pxor		%xmm3,%xmm1
 	pxor		%xmm10,%xmm0
 	pxor		%xmm11,%xmm1
-	addq		$0x40,%rsi
-	decq		%rdx
-	jnz		.Lbeginofloop
-	movdqu		%xmm0,(%rdi)
-	movdqu		%xmm1,0x10(%rdi)
-	movdqu		%xmm14,0x20(%rdi)
+	addq		$64,DATA
+	decq		NBLOCKS
+	jnz		.Lssse3_mainloop
+
+	movdqu		%xmm0,(CTX)	// Store new h[0..3]
+	movdqu		%xmm1,16(CTX)	// Store new h[4..7]
+	movdqu		%xmm14,32(CTX)	// Store new t and f
 	RET
 SYM_FUNC_END(blake2s_compress_ssse3)
 
+//
+// void blake2s_compress_avx512(struct blake2s_ctx *ctx,
+//				const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+//	u32 h[8];	(inout)
+//	u32 t[2];	(inout)
+//	u32 f[2];	(in)
+//
 SYM_FUNC_START(blake2s_compress_avx512)
-	vmovdqu		(%rdi),%xmm0
-	vmovdqu		0x10(%rdi),%xmm1
-	vmovdqu		0x20(%rdi),%xmm4
-	vmovd		%ecx,%xmm5
-	vmovdqa		.Liv(%rip),%xmm14
-	vmovdqa		.Liv+16(%rip),%xmm15
-	jmp		.Lblake2s_compress_avx512_mainloop
-.align 32
-.Lblake2s_compress_avx512_mainloop:
-	vmovdqa		%xmm0,%xmm10
-	vmovdqa		%xmm1,%xmm11
-	vpaddq		%xmm5,%xmm4,%xmm4
-	vmovdqa		%xmm14,%xmm2
-	vpxor		%xmm15,%xmm4,%xmm3
-	vmovdqu		(%rsi),%ymm6
-	vmovdqu		0x20(%rsi),%ymm7
-	addq		$0x40,%rsi
+	vmovdqu		(CTX),%xmm0	// Load h[0..3]
+	vmovdqu		16(CTX),%xmm1	// Load h[4..7]
+	vmovdqu		32(CTX),%xmm4	// Load t and f
+	vmovd		INC,%xmm5	// Load inc
+	vmovdqa		.Liv(%rip),%xmm14	// Load iv[0..3]
+	vmovdqa		.Liv+16(%rip),%xmm15	// Load iv[4..7]
+	jmp		.Lavx512_mainloop
+
+	.align 32
+.Lavx512_mainloop:
+	// Main loop: each iteration processes one 64-byte block.
+	vmovdqa		%xmm0,%xmm10	// Save h[0..3] and let v[0..3] = h[0..3]
+	vmovdqa		%xmm1,%xmm11	// Save h[4..7] and let v[4..7] = h[4..7]
+	vpaddq		%xmm5,%xmm4,%xmm4	// t += inc (64-bit addition)
+	vmovdqa		%xmm14,%xmm2	// v[8..11] = iv[0..3]
+	vpxor		%xmm15,%xmm4,%xmm3	// v[12..15] = iv[4..7] ^ [t, f]
+	vmovdqu		(DATA),%ymm6	// Load first 8 data words
+	vmovdqu		32(DATA),%ymm7	// Load second 8 data words
+	addq		$64,DATA
 	leaq		.Lsigma2(%rip),%rax
-	movb		$0xa,%cl
-.Lblake2s_compress_avx512_roundloop:
+	movb		$10,%cl		// Set num rounds remaining
+
+.Lavx512_roundloop:
+	// Round loop: each iteration does 1 round (of 10 rounds total).
 	vpmovzxbd	(%rax),%ymm8
-	vpmovzxbd	0x8(%rax),%ymm9
-	addq		$0x10,%rax
+	vpmovzxbd	8(%rax),%ymm9
+	addq		$16,%rax
 	vpermi2d	%ymm7,%ymm6,%ymm8
 	vpermi2d	%ymm7,%ymm6,%ymm9
 	vmovdqa		%ymm8,%ymm6
@@ -207,50 +241,53 @@ SYM_FUNC_START(blake2s_compress_avx512)
 	vpaddd		%xmm8,%xmm0,%xmm0
 	vpaddd		%xmm1,%xmm0,%xmm0
 	vpxor		%xmm0,%xmm3,%xmm3
-	vprord		$0x10,%xmm3,%xmm3
+	vprord		$16,%xmm3,%xmm3
 	vpaddd		%xmm3,%xmm2,%xmm2
 	vpxor		%xmm2,%xmm1,%xmm1
-	vprord		$0xc,%xmm1,%xmm1
-	vextracti128	$0x1,%ymm8,%xmm8
+	vprord		$12,%xmm1,%xmm1
+	vextracti128	$1,%ymm8,%xmm8
 	vpaddd		%xmm8,%xmm0,%xmm0
 	vpaddd		%xmm1,%xmm0,%xmm0
 	vpxor		%xmm0,%xmm3,%xmm3
-	vprord		$0x8,%xmm3,%xmm3
+	vprord		$8,%xmm3,%xmm3
 	vpaddd		%xmm3,%xmm2,%xmm2
 	vpxor		%xmm2,%xmm1,%xmm1
-	vprord		$0x7,%xmm1,%xmm1
+	vprord		$7,%xmm1,%xmm1
 	vpshufd		$0x93,%xmm0,%xmm0
 	vpshufd		$0x4e,%xmm3,%xmm3
 	vpshufd		$0x39,%xmm2,%xmm2
 	vpaddd		%xmm9,%xmm0,%xmm0
 	vpaddd		%xmm1,%xmm0,%xmm0
 	vpxor		%xmm0,%xmm3,%xmm3
-	vprord		$0x10,%xmm3,%xmm3
+	vprord		$16,%xmm3,%xmm3
 	vpaddd		%xmm3,%xmm2,%xmm2
 	vpxor		%xmm2,%xmm1,%xmm1
-	vprord		$0xc,%xmm1,%xmm1
-	vextracti128	$0x1,%ymm9,%xmm9
+	vprord		$12,%xmm1,%xmm1
+	vextracti128	$1,%ymm9,%xmm9
 	vpaddd		%xmm9,%xmm0,%xmm0
 	vpaddd		%xmm1,%xmm0,%xmm0
 	vpxor		%xmm0,%xmm3,%xmm3
-	vprord		$0x8,%xmm3,%xmm3
+	vprord		$8,%xmm3,%xmm3
 	vpaddd		%xmm3,%xmm2,%xmm2
 	vpxor		%xmm2,%xmm1,%xmm1
-	vprord		$0x7,%xmm1,%xmm1
+	vprord		$7,%xmm1,%xmm1
 	vpshufd		$0x39,%xmm0,%xmm0
 	vpshufd		$0x4e,%xmm3,%xmm3
 	vpshufd		$0x93,%xmm2,%xmm2
 	decb		%cl
-	jne		.Lblake2s_compress_avx512_roundloop
+	jne		.Lavx512_roundloop
+
+	// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
 	vpxor		%xmm10,%xmm0,%xmm0
 	vpxor		%xmm11,%xmm1,%xmm1
 	vpxor		%xmm2,%xmm0,%xmm0
 	vpxor		%xmm3,%xmm1,%xmm1
-	decq		%rdx
-	jne		.Lblake2s_compress_avx512_mainloop
-	vmovdqu		%xmm0,(%rdi)
-	vmovdqu		%xmm1,0x10(%rdi)
-	vmovdqu		%xmm4,0x20(%rdi)
+	decq		NBLOCKS
+	jne		.Lavx512_mainloop
+
+	vmovdqu		%xmm0,(CTX)	// Store new h[0..3]
+	vmovdqu		%xmm1,16(CTX)	// Store new h[4..7]
+	vmovdqu		%xmm4,32(CTX)	// Store new t and f
 	vzeroupper
 	RET
 SYM_FUNC_END(blake2s_compress_avx512)
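
The comments added by this patch all describe the standard BLAKE2s compression
function. As a reading aid, here is a minimal scalar C sketch of the per-block
flow those comments name; this is not kernel code (the authoritative generic C
version lives in lib/crypto), the names are hypothetical, the byte order
assumes a little-endian host such as x86, and it uses the textbook sigma
schedule rather than the load-order-optimized tables kept in this .S file:

	#include <stdint.h>
	#include <string.h>

	static inline uint32_t ror32(uint32_t x, int n)
	{
		return (x >> n) | (x << (32 - n));
	}

	static const uint32_t blake2s_iv[8] = {
		0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
		0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
	};

	static const uint8_t blake2s_sigma[10][16] = {
		{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
		{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
		{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
		{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
		{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
		{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
		{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
		{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
		{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
		{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
	};

	/* The G function: mixes one column or diagonal of the v[16] state.
	 * The rotation amounts 16, 12, 8, 7 match the vprord/psrld+pslld
	 * immediates in the assembly above. */
	static void g(uint32_t v[16], int a, int b, int c, int d,
		      uint32_t m0, uint32_t m1)
	{
		v[a] += v[b] + m0;
		v[d] = ror32(v[d] ^ v[a], 16);
		v[c] += v[d];
		v[b] = ror32(v[b] ^ v[c], 12);
		v[a] += v[b] + m1;
		v[d] = ror32(v[d] ^ v[a], 8);
		v[c] += v[d];
		v[b] = ror32(v[b] ^ v[c], 7);
	}

	/* Reference model of blake2s_compress_*(): h, t, and f correspond to
	 * the first three fields of struct blake2s_ctx. */
	static void blake2s_compress_ref(uint32_t h[8], uint32_t t[2],
					 const uint32_t f[2],
					 const uint8_t *data,
					 size_t nblocks, uint32_t inc)
	{
		while (nblocks--) {
			uint32_t m[16], v[16];
			int i, r;

			/* t += inc (64-bit addition over two u32 halves) */
			t[0] += inc;
			t[1] += (t[0] < inc);

			/* Load the 16 message words (little-endian host). */
			memcpy(m, data, 64);
			data += 64;

			/* v[0..7] = h[0..7]; v[8..11] = iv[0..3];
			 * v[12..15] = iv[4..7] ^ [t, f] */
			for (i = 0; i < 8; i++)
				v[i] = h[i];
			for (i = 0; i < 4; i++)
				v[8 + i] = blake2s_iv[i];
			v[12] = blake2s_iv[4] ^ t[0];
			v[13] = blake2s_iv[5] ^ t[1];
			v[14] = blake2s_iv[6] ^ f[0];
			v[15] = blake2s_iv[7] ^ f[1];

			/* 10 rounds: 4 column G's, then 4 diagonal G's. */
			for (r = 0; r < 10; r++) {
				const uint8_t *s = blake2s_sigma[r];

				g(v, 0, 4,  8, 12, m[s[0]],  m[s[1]]);
				g(v, 1, 5,  9, 13, m[s[2]],  m[s[3]]);
				g(v, 2, 6, 10, 14, m[s[4]],  m[s[5]]);
				g(v, 3, 7, 11, 15, m[s[6]],  m[s[7]]);
				g(v, 0, 5, 10, 15, m[s[8]],  m[s[9]]);
				g(v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
				g(v, 2, 7,  8, 13, m[s[12]], m[s[13]]);
				g(v, 3, 4,  9, 14, m[s[14]], m[s[15]]);
			}

			/* h[0..7] ^= v[0..7] ^ v[8..15] */
			for (i = 0; i < 8; i++)
				h[i] ^= v[i] ^ v[8 + i];
		}
	}

The assembly computes the same thing four G's at a time: v[0..3], v[4..7],
v[8..11], and v[12..15] each live in one vector register (xmm0 through xmm3),
so one pass of the G code mixes all four columns at once, and the pshufd /
vpshufd instructions rotate three of the four state rows to line the
diagonals up in the same lanes, then rotate them back after the diagonal
half-round.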