lib/crypto: x86/blake2s: Improve readability
author	Eric Biggers <ebiggers@kernel.org>
	Sun, 2 Nov 2025 23:42:07 +0000 (15:42 -0800)
committer	Eric Biggers <ebiggers@kernel.org>
	Thu, 6 Nov 2025 04:30:52 +0000 (20:30 -0800)
Various cleanups for readability.  No change to the generated code:

- Add some comments
- Add #defines for arguments
- Rename some labels
- Use decimal constants instead of hex where it makes sense.
  (The pshufd immediates intentionally remain as hex.)
- Add blank lines when there's a logical break

The round loop still could use some work, but this is at least a start.
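For reference, the contract that the new function comments document looks like
this from the C side. This is only a sketch: struct blake2s_ctx and the two
compress functions are named in the patch, the field comments paraphrase the
ones added here, and the example_update() wrapper is hypothetical (64 being
the BLAKE2s block size):

  struct blake2s_ctx {
          u32 h[8];       /* chaining value: read and updated */
          u32 t[2];       /* 64-bit byte counter: read and updated */
          u32 f[2];       /* finalization flags: read only */
          /* ... any further fields are not touched by the asm ... */
  };

  void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
                              const u8 *data, size_t nblocks, u32 inc);

  /* Hypothetical caller: hash nblocks full 64-byte blocks, advancing
   * the counter by 64 bytes per block. */
  static void example_update(struct blake2s_ctx *ctx, const u8 *data,
                             size_t nblocks)
  {
          blake2s_compress_ssse3(ctx, data, nblocks, 64);
  }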

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251102234209.62133-5-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
lib/crypto/x86/blake2s-core.S

index 14e487559c096c614025c1d53b46aafb2b352562..f805a49c590d5aadaf062f5744f6ac26a26d19d6 100644
 .byte 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
 .byte  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
 
+#define CTX            %rdi
+#define DATA           %rsi
+#define NBLOCKS                %rdx
+#define INC            %ecx
+
 .text
+//
+// void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
+//                            const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+//     u32 h[8];       (inout)
+//     u32 t[2];       (inout)
+//     u32 f[2];       (in)
+//
 SYM_FUNC_START(blake2s_compress_ssse3)
-       movdqu          (%rdi),%xmm0
-       movdqu          0x10(%rdi),%xmm1
+       movdqu          (CTX),%xmm0             // Load h[0..3]
+       movdqu          16(CTX),%xmm1           // Load h[4..7]
        movdqa          .Lror16(%rip),%xmm12
        movdqa          .Lror8(%rip),%xmm13
-       movdqu          0x20(%rdi),%xmm14
-       movd            %ecx,%xmm15
-       leaq            .Lsigma+0xa0(%rip),%r8
-       jmp             .Lbeginofloop
+       movdqu          32(CTX),%xmm14          // Load t and f
+       movd            INC,%xmm15              // Load inc
+       leaq            .Lsigma+160(%rip),%r8
+       jmp             .Lssse3_mainloop
+
        .align          32
-.Lbeginofloop:
-       movdqa          %xmm0,%xmm10
-       movdqa          %xmm1,%xmm11
-       paddq           %xmm15,%xmm14
-       movdqa          .Liv(%rip),%xmm2
+.Lssse3_mainloop:
+       // Main loop: each iteration processes one 64-byte block.
+       movdqa          %xmm0,%xmm10            // Save h[0..3] and let v[0..3] = h[0..3]
+       movdqa          %xmm1,%xmm11            // Save h[4..7] and let v[4..7] = h[4..7]
+       paddq           %xmm15,%xmm14           // t += inc (64-bit addition)
+       movdqa          .Liv(%rip),%xmm2        // v[8..11] = iv[0..3]
        movdqa          %xmm14,%xmm3
-       pxor            .Liv+0x10(%rip),%xmm3
+       pxor            .Liv+16(%rip),%xmm3     // v[12..15] = iv[4..7] ^ [t, f]
        leaq            .Lsigma(%rip),%rcx
-.Lroundloop:
+
+.Lssse3_roundloop:
+       // Round loop: each iteration does 1 round (of 10 rounds total).
        movzbl          (%rcx),%eax
-       movd            (%rsi,%rax,4),%xmm4
-       movzbl          0x1(%rcx),%eax
-       movd            (%rsi,%rax,4),%xmm5
-       movzbl          0x2(%rcx),%eax
-       movd            (%rsi,%rax,4),%xmm6
-       movzbl          0x3(%rcx),%eax
-       movd            (%rsi,%rax,4),%xmm7
+       movd            (DATA,%rax,4),%xmm4
+       movzbl          1(%rcx),%eax
+       movd            (DATA,%rax,4),%xmm5
+       movzbl          2(%rcx),%eax
+       movd            (DATA,%rax,4),%xmm6
+       movzbl          3(%rcx),%eax
+       movd            (DATA,%rax,4),%xmm7
        punpckldq       %xmm5,%xmm4
        punpckldq       %xmm7,%xmm6
        punpcklqdq      %xmm6,%xmm4
@@ -88,17 +106,17 @@ SYM_FUNC_START(blake2s_compress_ssse3)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm8
-       psrld           $0xc,%xmm1
-       pslld           $0x14,%xmm8
+       psrld           $12,%xmm1
+       pslld           $20,%xmm8
        por             %xmm8,%xmm1
-       movzbl          0x4(%rcx),%eax
-       movd            (%rsi,%rax,4),%xmm5
-       movzbl          0x5(%rcx),%eax
-       movd            (%rsi,%rax,4),%xmm6
-       movzbl          0x6(%rcx),%eax
-       movd            (%rsi,%rax,4),%xmm7
-       movzbl          0x7(%rcx),%eax
-       movd            (%rsi,%rax,4),%xmm4
+       movzbl          4(%rcx),%eax
+       movd            (DATA,%rax,4),%xmm5
+       movzbl          5(%rcx),%eax
+       movd            (DATA,%rax,4),%xmm6
+       movzbl          6(%rcx),%eax
+       movd            (DATA,%rax,4),%xmm7
+       movzbl          7(%rcx),%eax
+       movd            (DATA,%rax,4),%xmm4
        punpckldq       %xmm6,%xmm5
        punpckldq       %xmm4,%xmm7
        punpcklqdq      %xmm7,%xmm5
@@ -109,20 +127,20 @@ SYM_FUNC_START(blake2s_compress_ssse3)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm8
-       psrld           $0x7,%xmm1
-       pslld           $0x19,%xmm8
+       psrld           $7,%xmm1
+       pslld           $25,%xmm8
        por             %xmm8,%xmm1
        pshufd          $0x93,%xmm0,%xmm0
        pshufd          $0x4e,%xmm3,%xmm3
        pshufd          $0x39,%xmm2,%xmm2
-       movzbl          0x8(%rcx),%eax
-       movd            (%rsi,%rax,4),%xmm6
-       movzbl          0x9(%rcx),%eax
-       movd            (%rsi,%rax,4),%xmm7
-       movzbl          0xa(%rcx),%eax
-       movd            (%rsi,%rax,4),%xmm4
-       movzbl          0xb(%rcx),%eax
-       movd            (%rsi,%rax,4),%xmm5
+       movzbl          8(%rcx),%eax
+       movd            (DATA,%rax,4),%xmm6
+       movzbl          9(%rcx),%eax
+       movd            (DATA,%rax,4),%xmm7
+       movzbl          10(%rcx),%eax
+       movd            (DATA,%rax,4),%xmm4
+       movzbl          11(%rcx),%eax
+       movd            (DATA,%rax,4),%xmm5
        punpckldq       %xmm7,%xmm6
        punpckldq       %xmm5,%xmm4
        punpcklqdq      %xmm4,%xmm6
@@ -133,17 +151,17 @@ SYM_FUNC_START(blake2s_compress_ssse3)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm8
-       psrld           $0xc,%xmm1
-       pslld           $0x14,%xmm8
+       psrld           $12,%xmm1
+       pslld           $20,%xmm8
        por             %xmm8,%xmm1
-       movzbl          0xc(%rcx),%eax
-       movd            (%rsi,%rax,4),%xmm7
-       movzbl          0xd(%rcx),%eax
-       movd            (%rsi,%rax,4),%xmm4
-       movzbl          0xe(%rcx),%eax
-       movd            (%rsi,%rax,4),%xmm5
-       movzbl          0xf(%rcx),%eax
-       movd            (%rsi,%rax,4),%xmm6
+       movzbl          12(%rcx),%eax
+       movd            (DATA,%rax,4),%xmm7
+       movzbl          13(%rcx),%eax
+       movd            (DATA,%rax,4),%xmm4
+       movzbl          14(%rcx),%eax
+       movd            (DATA,%rax,4),%xmm5
+       movzbl          15(%rcx),%eax
+       movd            (DATA,%rax,4),%xmm6
        punpckldq       %xmm4,%xmm7
        punpckldq       %xmm6,%xmm5
        punpcklqdq      %xmm5,%xmm7
@@ -154,52 +172,68 @@ SYM_FUNC_START(blake2s_compress_ssse3)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm8
-       psrld           $0x7,%xmm1
-       pslld           $0x19,%xmm8
+       psrld           $7,%xmm1
+       pslld           $25,%xmm8
        por             %xmm8,%xmm1
        pshufd          $0x39,%xmm0,%xmm0
        pshufd          $0x4e,%xmm3,%xmm3
        pshufd          $0x93,%xmm2,%xmm2
-       addq            $0x10,%rcx
+       addq            $16,%rcx
        cmpq            %r8,%rcx
-       jnz             .Lroundloop
+       jnz             .Lssse3_roundloop
+
+       // Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
        pxor            %xmm2,%xmm0
        pxor            %xmm3,%xmm1
        pxor            %xmm10,%xmm0
        pxor            %xmm11,%xmm1
-       addq            $0x40,%rsi
-       decq            %rdx
-       jnz             .Lbeginofloop
-       movdqu          %xmm0,(%rdi)
-       movdqu          %xmm1,0x10(%rdi)
-       movdqu          %xmm14,0x20(%rdi)
+       addq            $64,DATA
+       decq            NBLOCKS
+       jnz             .Lssse3_mainloop
+
+       movdqu          %xmm0,(CTX)             // Store new h[0..3]
+       movdqu          %xmm1,16(CTX)           // Store new h[4..7]
+       movdqu          %xmm14,32(CTX)          // Store new t and f
        RET
 SYM_FUNC_END(blake2s_compress_ssse3)
 
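For orientation, the per-block structure that the main-loop comments above
describe is, in scalar C terms, roughly the following. This is a sketch of the
generic BLAKE2s compression skeleton, not code from this patch; blake2s_iv[]
and blake2s_round() are stand-ins for the .Liv table and the vectorized round
loop:

  static void blake2s_compress_sketch(struct blake2s_ctx *ctx,
                                      const u32 m[16], u32 inc)
  {
          u32 v[16];
          u64 t;
          int i, r;

          /* t += inc (64-bit addition spread across t[0] and t[1]) */
          t = (((u64)ctx->t[1] << 32) | ctx->t[0]) + inc;
          ctx->t[0] = (u32)t;
          ctx->t[1] = (u32)(t >> 32);

          for (i = 0; i < 8; i++)
                  v[i] = ctx->h[i];               /* v[0..7] = h[0..7] */
          for (i = 0; i < 4; i++)
                  v[8 + i] = blake2s_iv[i];       /* v[8..11] = iv[0..3] */
          v[12] = blake2s_iv[4] ^ ctx->t[0];      /* v[12..15] =         */
          v[13] = blake2s_iv[5] ^ ctx->t[1];      /*   iv[4..7] ^ [t, f] */
          v[14] = blake2s_iv[6] ^ ctx->f[0];
          v[15] = blake2s_iv[7] ^ ctx->f[1];

          for (r = 0; r < 10; r++)
                  blake2s_round(v, m, r);         /* the round loop above */

          for (i = 0; i < 8; i++)                 /* h ^= v[0..7] ^ v[8..15] */
                  ctx->h[i] ^= v[i] ^ v[8 + i];
  }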
+//
+// void blake2s_compress_avx512(struct blake2s_ctx *ctx,
+//                             const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+//     u32 h[8];       (inout)
+//     u32 t[2];       (inout)
+//     u32 f[2];       (in)
+//
 SYM_FUNC_START(blake2s_compress_avx512)
-       vmovdqu         (%rdi),%xmm0
-       vmovdqu         0x10(%rdi),%xmm1
-       vmovdqu         0x20(%rdi),%xmm4
-       vmovd           %ecx,%xmm5
-       vmovdqa         .Liv(%rip),%xmm14
-       vmovdqa         .Liv+16(%rip),%xmm15
-       jmp             .Lblake2s_compress_avx512_mainloop
-.align 32
-.Lblake2s_compress_avx512_mainloop:
-       vmovdqa         %xmm0,%xmm10
-       vmovdqa         %xmm1,%xmm11
-       vpaddq          %xmm5,%xmm4,%xmm4
-       vmovdqa         %xmm14,%xmm2
-       vpxor           %xmm15,%xmm4,%xmm3
-       vmovdqu         (%rsi),%ymm6
-       vmovdqu         0x20(%rsi),%ymm7
-       addq            $0x40,%rsi
+       vmovdqu         (CTX),%xmm0             // Load h[0..3]
+       vmovdqu         16(CTX),%xmm1           // Load h[4..7]
+       vmovdqu         32(CTX),%xmm4           // Load t and f
+       vmovd           INC,%xmm5               // Load inc
+       vmovdqa         .Liv(%rip),%xmm14       // Load iv[0..3]
+       vmovdqa         .Liv+16(%rip),%xmm15    // Load iv[4..7]
+       jmp             .Lavx512_mainloop
+
+       .align          32
+.Lavx512_mainloop:
+       // Main loop: each iteration processes one 64-byte block.
+       vmovdqa         %xmm0,%xmm10            // Save h[0..3] and let v[0..3] = h[0..3]
+       vmovdqa         %xmm1,%xmm11            // Save h[4..7] and let v[4..7] = h[4..7]
+       vpaddq          %xmm5,%xmm4,%xmm4       // t += inc (64-bit addition)
+       vmovdqa         %xmm14,%xmm2            // v[8..11] = iv[0..3]
+       vpxor           %xmm15,%xmm4,%xmm3      // v[12..15] = iv[4..7] ^ [t, f]
+       vmovdqu         (DATA),%ymm6            // Load first 8 data words
+       vmovdqu         32(DATA),%ymm7          // Load second 8 data words
+       addq            $64,DATA
        leaq            .Lsigma2(%rip),%rax
-       movb            $0xa,%cl
-.Lblake2s_compress_avx512_roundloop:
+       movb            $10,%cl                 // Set num rounds remaining
+
+.Lavx512_roundloop:
+       // Round loop: each iteration does 1 round (of 10 rounds total).
        vpmovzxbd       (%rax),%ymm8
-       vpmovzxbd       0x8(%rax),%ymm9
-       addq            $0x10,%rax
+       vpmovzxbd       8(%rax),%ymm9
+       addq            $16,%rax
        vpermi2d        %ymm7,%ymm6,%ymm8
        vpermi2d        %ymm7,%ymm6,%ymm9
        vmovdqa         %ymm8,%ymm6
@@ -207,50 +241,53 @@ SYM_FUNC_START(blake2s_compress_avx512)
        vpaddd          %xmm8,%xmm0,%xmm0
        vpaddd          %xmm1,%xmm0,%xmm0
        vpxor           %xmm0,%xmm3,%xmm3
-       vprord          $0x10,%xmm3,%xmm3
+       vprord          $16,%xmm3,%xmm3
        vpaddd          %xmm3,%xmm2,%xmm2
        vpxor           %xmm2,%xmm1,%xmm1
-       vprord          $0xc,%xmm1,%xmm1
-       vextracti128    $0x1,%ymm8,%xmm8
+       vprord          $12,%xmm1,%xmm1
+       vextracti128    $1,%ymm8,%xmm8
        vpaddd          %xmm8,%xmm0,%xmm0
        vpaddd          %xmm1,%xmm0,%xmm0
        vpxor           %xmm0,%xmm3,%xmm3
-       vprord          $0x8,%xmm3,%xmm3
+       vprord          $8,%xmm3,%xmm3
        vpaddd          %xmm3,%xmm2,%xmm2
        vpxor           %xmm2,%xmm1,%xmm1
-       vprord          $0x7,%xmm1,%xmm1
+       vprord          $7,%xmm1,%xmm1
        vpshufd         $0x93,%xmm0,%xmm0
        vpshufd         $0x4e,%xmm3,%xmm3
        vpshufd         $0x39,%xmm2,%xmm2
        vpaddd          %xmm9,%xmm0,%xmm0
        vpaddd          %xmm1,%xmm0,%xmm0
        vpxor           %xmm0,%xmm3,%xmm3
-       vprord          $0x10,%xmm3,%xmm3
+       vprord          $16,%xmm3,%xmm3
        vpaddd          %xmm3,%xmm2,%xmm2
        vpxor           %xmm2,%xmm1,%xmm1
-       vprord          $0xc,%xmm1,%xmm1
-       vextracti128    $0x1,%ymm9,%xmm9
+       vprord          $12,%xmm1,%xmm1
+       vextracti128    $1,%ymm9,%xmm9
        vpaddd          %xmm9,%xmm0,%xmm0
        vpaddd          %xmm1,%xmm0,%xmm0
        vpxor           %xmm0,%xmm3,%xmm3
-       vprord          $0x8,%xmm3,%xmm3
+       vprord          $8,%xmm3,%xmm3
        vpaddd          %xmm3,%xmm2,%xmm2
        vpxor           %xmm2,%xmm1,%xmm1
-       vprord          $0x7,%xmm1,%xmm1
+       vprord          $7,%xmm1,%xmm1
        vpshufd         $0x39,%xmm0,%xmm0
        vpshufd         $0x4e,%xmm3,%xmm3
        vpshufd         $0x93,%xmm2,%xmm2
        decb            %cl
-       jne             .Lblake2s_compress_avx512_roundloop
+       jne             .Lavx512_roundloop
+
+       // Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
        vpxor           %xmm10,%xmm0,%xmm0
        vpxor           %xmm11,%xmm1,%xmm1
        vpxor           %xmm2,%xmm0,%xmm0
        vpxor           %xmm3,%xmm1,%xmm1
-       decq            %rdx
-       jne             .Lblake2s_compress_avx512_mainloop
-       vmovdqu         %xmm0,(%rdi)
-       vmovdqu         %xmm1,0x10(%rdi)
-       vmovdqu         %xmm4,0x20(%rdi)
+       decq            NBLOCKS
+       jne             .Lavx512_mainloop
+
+       vmovdqu         %xmm0,(CTX)             // Store new h[0..3]
+       vmovdqu         %xmm1,16(CTX)           // Store new h[4..7]
+       vmovdqu         %xmm4,32(CTX)           // Store new t and f
        vzeroupper
        RET
 SYM_FUNC_END(blake2s_compress_avx512)
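The rotation amounts visible in both functions (16, 12, 8, 7) are the standard
BLAKE2s G-function rotations. SSSE3 has no vector rotate, so ror32(x, 12) is
open-coded as the psrld $12 / pslld $20 / por sequence seen above, whereas
AVX-512 expresses it directly with vprord. The pshufd $0x93 / $0x4e / $0x39
triples rotate three of the four state rows so the same column code then
computes the diagonal step, and the inverse triple undoes it. A scalar sketch
of one quarter-round, assuming nothing beyond the BLAKE2s spec:

  static inline u32 ror32_sketch(u32 x, int n)
  {
          return (x >> n) | (x << (32 - n));
  }

  /* One G quarter-round on state words a, b, c, d with message words
   * x and y; each round above applies this to four columns at once,
   * then (after the pshufd diagonalization) to four diagonals. */
  static void blake2s_g_sketch(u32 *a, u32 *b, u32 *c, u32 *d, u32 x, u32 y)
  {
          *a += *b + x;   *d = ror32_sketch(*d ^ *a, 16);
          *c += *d;       *b = ror32_sketch(*b ^ *c, 12);
          *a += *b + y;   *d = ror32_sketch(*d ^ *a, 8);
          *c += *d;       *b = ror32_sketch(*b ^ *c, 7);
  }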