crypto: arm64/ghash - Move NEON GHASH assembly into its own file
author     Eric Biggers <ebiggers@kernel.org>
           Thu, 19 Mar 2026 06:17:09 +0000 (23:17 -0700)
committer  Eric Biggers <ebiggers@kernel.org>
           Mon, 23 Mar 2026 22:24:59 +0000 (15:24 -0700)

arch/arm64/crypto/ghash-ce-core.S implements pmull_ghash_update_p8(),
which is used only by a crypto_shash implementation of GHASH.  It also
implements other functions, including pmull_ghash_update_p64(), which
are used only by a crypto_aead implementation of AES-GCM.

While some code is shared between pmull_ghash_update_p8() and
pmull_ghash_update_p64(), it's not very much.  Since
pmull_ghash_update_p8() will also need to be migrated into lib/crypto/
to achieve parity in the standalone GHASH support, let's move it into a
separate file ghash-neon-core.S.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20260319061723.1140720-9-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
arch/arm64/crypto/Makefile
arch/arm64/crypto/ghash-ce-core.S
arch/arm64/crypto/ghash-neon-core.S [new file with mode: 0644]
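
For context: both assembly routines keep the same C prototype (reproduced in
the hunks below), with pmull_ghash_update_p64() relying on the 64x64->128
PMULL instruction and pmull_ghash_update_p8() using only plain NEON/ASIMD
multiplies. The following is a minimal C sketch of how a caller might choose
between the two; the ghash_simd_update() helper and its simplified shape are
illustrative assumptions, not the actual ghash-ce-glue.c code.

/*
 * Illustrative sketch only -- not the kernel's ghash-ce-glue.c.
 * The prototypes match the comments in the assembly below; the
 * dispatch helper is hypothetical.
 */
#include <linux/linkage.h>
#include <linux/types.h>
#include <asm/cpufeature.h>
#include <asm/neon.h>

asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
				       u64 const h[][2], const char *head);
asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
				      u64 const h[][2], const char *head);

static void ghash_simd_update(int blocks, u64 dg[], const char *src,
			      u64 const h[][2], const char *head)
{
	/* Prefer the PMULL-based routine when the CPU implements it. */
	void (*fn)(int, u64[], const char *, u64 const (*)[2], const char *) =
		cpu_have_named_feature(PMULL) ? pmull_ghash_update_p64
					      : pmull_ghash_update_p8;

	/* NEON registers may only be used inside a kernel-mode NEON section. */
	kernel_neon_begin();
	fn(blocks, dg, src, h, head);
	kernel_neon_end();
}
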

diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index 8a8e3e551ed332056abc1c67a4be405ad7a5dd3c..b7ba43ce8584c3c9de0dcc033957f07593b16519 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -27,7 +27,7 @@ obj-$(CONFIG_CRYPTO_SM4_ARM64_NEON_BLK) += sm4-neon.o
 sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o
 
 obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
-ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
+ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o ghash-neon-core.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
 aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index 23ee9a5eaf27c23c5b30ead46d7761e1909cd4a5..4344fe213d14c7af971d730f6be9a2f7cc43d7a9 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Accelerated GHASH implementation with ARMv8 PMULL instructions.
+ * Accelerated AES-GCM implementation with ARMv8 Crypto Extensions.
  *
  * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
  */
        XH              .req    v7
        IN1             .req    v7
 
-       k00_16          .req    v8
-       k32_48          .req    v9
-
-       t3              .req    v10
-       t4              .req    v11
-       t5              .req    v12
-       t6              .req    v13
-       t7              .req    v14
-       t8              .req    v15
-       t9              .req    v16
-
-       perm1           .req    v17
-       perm2           .req    v18
-       perm3           .req    v19
-
-       sh1             .req    v20
-       sh2             .req    v21
-       sh3             .req    v22
-       sh4             .req    v23
-
-       ss1             .req    v24
-       ss2             .req    v25
-       ss3             .req    v26
-       ss4             .req    v27
-
        XL2             .req    v8
        XM2             .req    v9
        XH2             .req    v10
        .text
        .arch           armv8-a+crypto
 
-       .macro          __pmull_p64, rd, rn, rm
-       pmull           \rd\().1q, \rn\().1d, \rm\().1d
-       .endm
-
-       .macro          __pmull2_p64, rd, rn, rm
-       pmull2          \rd\().1q, \rn\().2d, \rm\().2d
-       .endm
-
-       .macro          __pmull_p8, rq, ad, bd
-       ext             t3.8b, \ad\().8b, \ad\().8b, #1         // A1
-       ext             t5.8b, \ad\().8b, \ad\().8b, #2         // A2
-       ext             t7.8b, \ad\().8b, \ad\().8b, #3         // A3
-
-       __pmull_p8_\bd  \rq, \ad
-       .endm
-
-       .macro          __pmull2_p8, rq, ad, bd
-       tbl             t3.16b, {\ad\().16b}, perm1.16b         // A1
-       tbl             t5.16b, {\ad\().16b}, perm2.16b         // A2
-       tbl             t7.16b, {\ad\().16b}, perm3.16b         // A3
-
-       __pmull2_p8_\bd \rq, \ad
-       .endm
-
-       .macro          __pmull_p8_SHASH, rq, ad
-       __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
-       .endm
-
-       .macro          __pmull_p8_SHASH2, rq, ad
-       __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
-       .endm
-
-       .macro          __pmull2_p8_SHASH, rq, ad
-       __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
-       .endm
-
-       .macro          __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
-       pmull\t         t3.8h, t3.\nb, \bd                      // F = A1*B
-       pmull\t         t4.8h, \ad, \b1\().\nb                  // E = A*B1
-       pmull\t         t5.8h, t5.\nb, \bd                      // H = A2*B
-       pmull\t         t6.8h, \ad, \b2\().\nb                  // G = A*B2
-       pmull\t         t7.8h, t7.\nb, \bd                      // J = A3*B
-       pmull\t         t8.8h, \ad, \b3\().\nb                  // I = A*B3
-       pmull\t         t9.8h, \ad, \b4\().\nb                  // K = A*B4
-       pmull\t         \rq\().8h, \ad, \bd                     // D = A*B
-
-       eor             t3.16b, t3.16b, t4.16b                  // L = E + F
-       eor             t5.16b, t5.16b, t6.16b                  // M = G + H
-       eor             t7.16b, t7.16b, t8.16b                  // N = I + J
-
-       uzp1            t4.2d, t3.2d, t5.2d
-       uzp2            t3.2d, t3.2d, t5.2d
-       uzp1            t6.2d, t7.2d, t9.2d
-       uzp2            t7.2d, t7.2d, t9.2d
-
-       // t3 = (L) (P0 + P1) << 8
-       // t5 = (M) (P2 + P3) << 16
-       eor             t4.16b, t4.16b, t3.16b
-       and             t3.16b, t3.16b, k32_48.16b
-
-       // t7 = (N) (P4 + P5) << 24
-       // t9 = (K) (P6 + P7) << 32
-       eor             t6.16b, t6.16b, t7.16b
-       and             t7.16b, t7.16b, k00_16.16b
-
-       eor             t4.16b, t4.16b, t3.16b
-       eor             t6.16b, t6.16b, t7.16b
-
-       zip2            t5.2d, t4.2d, t3.2d
-       zip1            t3.2d, t4.2d, t3.2d
-       zip2            t9.2d, t6.2d, t7.2d
-       zip1            t7.2d, t6.2d, t7.2d
-
-       ext             t3.16b, t3.16b, t3.16b, #15
-       ext             t5.16b, t5.16b, t5.16b, #14
-       ext             t7.16b, t7.16b, t7.16b, #13
-       ext             t9.16b, t9.16b, t9.16b, #12
-
-       eor             t3.16b, t3.16b, t5.16b
-       eor             t7.16b, t7.16b, t9.16b
-       eor             \rq\().16b, \rq\().16b, t3.16b
-       eor             \rq\().16b, \rq\().16b, t7.16b
-       .endm
-
        .macro          __pmull_pre_p64
        add             x8, x3, #16
        ld1             {HH.2d-HH4.2d}, [x8]
        shl             MASK.2d, MASK.2d, #57
        .endm
 
-       .macro          __pmull_pre_p8
-       ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
-       eor             SHASH2.16b, SHASH2.16b, SHASH.16b
-
-       // k00_16 := 0x0000000000000000_000000000000ffff
-       // k32_48 := 0x00000000ffffffff_0000ffffffffffff
-       movi            k32_48.2d, #0xffffffff
-       mov             k32_48.h[2], k32_48.h[0]
-       ushr            k00_16.2d, k32_48.2d, #32
-
-       // prepare the permutation vectors
-       mov_q           x5, 0x080f0e0d0c0b0a09
-       movi            T1.8b, #8
-       dup             perm1.2d, x5
-       eor             perm1.16b, perm1.16b, T1.16b
-       ushr            perm2.2d, perm1.2d, #8
-       ushr            perm3.2d, perm1.2d, #16
-       ushr            T1.2d, perm1.2d, #24
-       sli             perm2.2d, perm1.2d, #56
-       sli             perm3.2d, perm1.2d, #48
-       sli             T1.2d, perm1.2d, #40
-
-       // precompute loop invariants
-       tbl             sh1.16b, {SHASH.16b}, perm1.16b
-       tbl             sh2.16b, {SHASH.16b}, perm2.16b
-       tbl             sh3.16b, {SHASH.16b}, perm3.16b
-       tbl             sh4.16b, {SHASH.16b}, T1.16b
-       ext             ss1.8b, SHASH2.8b, SHASH2.8b, #1
-       ext             ss2.8b, SHASH2.8b, SHASH2.8b, #2
-       ext             ss3.8b, SHASH2.8b, SHASH2.8b, #3
-       ext             ss4.8b, SHASH2.8b, SHASH2.8b, #4
-       .endm
-
-       //
-       // PMULL (64x64->128) based reduction for CPUs that can do
-       // it in a single instruction.
-       //
        .macro          __pmull_reduce_p64
        pmull           T2.1q, XL.1d, MASK.1d
        eor             XM.16b, XM.16b, T1.16b
        pmull           XL.1q, XL.1d, MASK.1d
        .endm
 
-       //
-       // Alternative reduction for CPUs that lack support for the
-       // 64x64->128 PMULL instruction
-       //
-       .macro          __pmull_reduce_p8
-       eor             XM.16b, XM.16b, T1.16b
-
-       mov             XL.d[1], XM.d[0]
-       mov             XH.d[0], XM.d[1]
-
-       shl             T1.2d, XL.2d, #57
-       shl             T2.2d, XL.2d, #62
-       eor             T2.16b, T2.16b, T1.16b
-       shl             T1.2d, XL.2d, #63
-       eor             T2.16b, T2.16b, T1.16b
-       ext             T1.16b, XL.16b, XH.16b, #8
-       eor             T2.16b, T2.16b, T1.16b
-
-       mov             XL.d[1], T2.d[0]
-       mov             XH.d[0], T2.d[1]
-
-       ushr            T2.2d, XL.2d, #1
-       eor             XH.16b, XH.16b, XL.16b
-       eor             XL.16b, XL.16b, T2.16b
-       ushr            T2.2d, T2.2d, #6
-       ushr            XL.2d, XL.2d, #1
-       .endm
-
-       .macro          __pmull_ghash, pn
+       /*
+        * void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
+        *                             u64 const h[][2], const char *head)
+        */
+SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
        ld1             {SHASH.2d}, [x3]
        ld1             {XL.2d}, [x1]
 
-       __pmull_pre_\pn
+       __pmull_pre_p64
 
        /* do the head block first, if supplied */
        cbz             x4, 0f
        mov             x4, xzr
        b               3f
 
-0:     .ifc            \pn, p64
+0:
        tbnz            w0, #0, 2f              // skip until #blocks is a
        tbnz            w0, #1, 2f              // round multiple of 4
 
 
        cbz             w0, 5f
        b               1b
-       .endif
 
 2:     ld1             {T1.2d}, [x2], #16
        sub             w0, w0, #1
@@ -327,16 +156,16 @@ CPU_LE(	rev64		T1.16b, T1.16b	)
        eor             T1.16b, T1.16b, T2.16b
        eor             XL.16b, XL.16b, IN1.16b
 
-       __pmull2_\pn    XH, XL, SHASH                   // a1 * b1
+       pmull2          XH.1q, XL.2d, SHASH.2d          // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
-       __pmull_\pn     XL, XL, SHASH                   // a0 * b0
-       __pmull_\pn     XM, T1, SHASH2                  // (a1 + a0)(b1 + b0)
+       pmull           XL.1q, XL.1d, SHASH.1d          // a0 * b0
+       pmull           XM.1q, T1.1d, SHASH2.1d         // (a1 + a0)(b1 + b0)
 
 4:     eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b
 
-       __pmull_reduce_\pn
+       __pmull_reduce_p64
 
        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b
@@ -345,20 +174,8 @@ CPU_LE(	rev64		T1.16b, T1.16b	)
 
 5:     st1             {XL.2d}, [x1]
        ret
-       .endm
-
-       /*
-        * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
-        *                         struct ghash_key const *k, const char *head)
-        */
-SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
-       __pmull_ghash   p64
 SYM_FUNC_END(pmull_ghash_update_p64)
 
-SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
-       __pmull_ghash   p8
-SYM_FUNC_END(pmull_ghash_update_p8)
-
        KS0             .req    v8
        KS1             .req    v9
        KS2             .req    v10
diff --git a/arch/arm64/crypto/ghash-neon-core.S b/arch/arm64/crypto/ghash-neon-core.S
new file mode 100644
index 0000000..6157135
--- /dev/null
+++ b/arch/arm64/crypto/ghash-neon-core.S
@@ -0,0 +1,226 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Accelerated GHASH implementation with ARMv8 ASIMD instructions.
+ *
+ * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+#include <asm/assembler.h>
+
+       SHASH           .req    v0
+       SHASH2          .req    v1
+       T1              .req    v2
+       T2              .req    v3
+       XM              .req    v5
+       XL              .req    v6
+       XH              .req    v7
+       IN1             .req    v7
+
+       k00_16          .req    v8
+       k32_48          .req    v9
+
+       t3              .req    v10
+       t4              .req    v11
+       t5              .req    v12
+       t6              .req    v13
+       t7              .req    v14
+       t8              .req    v15
+       t9              .req    v16
+
+       perm1           .req    v17
+       perm2           .req    v18
+       perm3           .req    v19
+
+       sh1             .req    v20
+       sh2             .req    v21
+       sh3             .req    v22
+       sh4             .req    v23
+
+       ss1             .req    v24
+       ss2             .req    v25
+       ss3             .req    v26
+       ss4             .req    v27
+
+       .text
+
+       .macro          __pmull_p8, rq, ad, bd
+       ext             t3.8b, \ad\().8b, \ad\().8b, #1         // A1
+       ext             t5.8b, \ad\().8b, \ad\().8b, #2         // A2
+       ext             t7.8b, \ad\().8b, \ad\().8b, #3         // A3
+
+       __pmull_p8_\bd  \rq, \ad
+       .endm
+
+       .macro          __pmull2_p8, rq, ad, bd
+       tbl             t3.16b, {\ad\().16b}, perm1.16b         // A1
+       tbl             t5.16b, {\ad\().16b}, perm2.16b         // A2
+       tbl             t7.16b, {\ad\().16b}, perm3.16b         // A3
+
+       __pmull2_p8_\bd \rq, \ad
+       .endm
+
+       .macro          __pmull_p8_SHASH, rq, ad
+       __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
+       .endm
+
+       .macro          __pmull_p8_SHASH2, rq, ad
+       __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
+       .endm
+
+       .macro          __pmull2_p8_SHASH, rq, ad
+       __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
+       .endm
+
+       .macro          __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
+       pmull\t         t3.8h, t3.\nb, \bd                      // F = A1*B
+       pmull\t         t4.8h, \ad, \b1\().\nb                  // E = A*B1
+       pmull\t         t5.8h, t5.\nb, \bd                      // H = A2*B
+       pmull\t         t6.8h, \ad, \b2\().\nb                  // G = A*B2
+       pmull\t         t7.8h, t7.\nb, \bd                      // J = A3*B
+       pmull\t         t8.8h, \ad, \b3\().\nb                  // I = A*B3
+       pmull\t         t9.8h, \ad, \b4\().\nb                  // K = A*B4
+       pmull\t         \rq\().8h, \ad, \bd                     // D = A*B
+
+       eor             t3.16b, t3.16b, t4.16b                  // L = E + F
+       eor             t5.16b, t5.16b, t6.16b                  // M = G + H
+       eor             t7.16b, t7.16b, t8.16b                  // N = I + J
+
+       uzp1            t4.2d, t3.2d, t5.2d
+       uzp2            t3.2d, t3.2d, t5.2d
+       uzp1            t6.2d, t7.2d, t9.2d
+       uzp2            t7.2d, t7.2d, t9.2d
+
+       // t3 = (L) (P0 + P1) << 8
+       // t5 = (M) (P2 + P3) << 16
+       eor             t4.16b, t4.16b, t3.16b
+       and             t3.16b, t3.16b, k32_48.16b
+
+       // t7 = (N) (P4 + P5) << 24
+       // t9 = (K) (P6 + P7) << 32
+       eor             t6.16b, t6.16b, t7.16b
+       and             t7.16b, t7.16b, k00_16.16b
+
+       eor             t4.16b, t4.16b, t3.16b
+       eor             t6.16b, t6.16b, t7.16b
+
+       zip2            t5.2d, t4.2d, t3.2d
+       zip1            t3.2d, t4.2d, t3.2d
+       zip2            t9.2d, t6.2d, t7.2d
+       zip1            t7.2d, t6.2d, t7.2d
+
+       ext             t3.16b, t3.16b, t3.16b, #15
+       ext             t5.16b, t5.16b, t5.16b, #14
+       ext             t7.16b, t7.16b, t7.16b, #13
+       ext             t9.16b, t9.16b, t9.16b, #12
+
+       eor             t3.16b, t3.16b, t5.16b
+       eor             t7.16b, t7.16b, t9.16b
+       eor             \rq\().16b, \rq\().16b, t3.16b
+       eor             \rq\().16b, \rq\().16b, t7.16b
+       .endm
+
+       .macro          __pmull_pre_p8
+       ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
+       eor             SHASH2.16b, SHASH2.16b, SHASH.16b
+
+       // k00_16 := 0x0000000000000000_000000000000ffff
+       // k32_48 := 0x00000000ffffffff_0000ffffffffffff
+       movi            k32_48.2d, #0xffffffff
+       mov             k32_48.h[2], k32_48.h[0]
+       ushr            k00_16.2d, k32_48.2d, #32
+
+       // prepare the permutation vectors
+       mov_q           x5, 0x080f0e0d0c0b0a09
+       movi            T1.8b, #8
+       dup             perm1.2d, x5
+       eor             perm1.16b, perm1.16b, T1.16b
+       ushr            perm2.2d, perm1.2d, #8
+       ushr            perm3.2d, perm1.2d, #16
+       ushr            T1.2d, perm1.2d, #24
+       sli             perm2.2d, perm1.2d, #56
+       sli             perm3.2d, perm1.2d, #48
+       sli             T1.2d, perm1.2d, #40
+
+       // precompute loop invariants
+       tbl             sh1.16b, {SHASH.16b}, perm1.16b
+       tbl             sh2.16b, {SHASH.16b}, perm2.16b
+       tbl             sh3.16b, {SHASH.16b}, perm3.16b
+       tbl             sh4.16b, {SHASH.16b}, T1.16b
+       ext             ss1.8b, SHASH2.8b, SHASH2.8b, #1
+       ext             ss2.8b, SHASH2.8b, SHASH2.8b, #2
+       ext             ss3.8b, SHASH2.8b, SHASH2.8b, #3
+       ext             ss4.8b, SHASH2.8b, SHASH2.8b, #4
+       .endm
+
+       .macro          __pmull_reduce_p8
+       eor             XM.16b, XM.16b, T1.16b
+
+       mov             XL.d[1], XM.d[0]
+       mov             XH.d[0], XM.d[1]
+
+       shl             T1.2d, XL.2d, #57
+       shl             T2.2d, XL.2d, #62
+       eor             T2.16b, T2.16b, T1.16b
+       shl             T1.2d, XL.2d, #63
+       eor             T2.16b, T2.16b, T1.16b
+       ext             T1.16b, XL.16b, XH.16b, #8
+       eor             T2.16b, T2.16b, T1.16b
+
+       mov             XL.d[1], T2.d[0]
+       mov             XH.d[0], T2.d[1]
+
+       ushr            T2.2d, XL.2d, #1
+       eor             XH.16b, XH.16b, XL.16b
+       eor             XL.16b, XL.16b, T2.16b
+       ushr            T2.2d, T2.2d, #6
+       ushr            XL.2d, XL.2d, #1
+       .endm
+
+       /*
+        * void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
+        *                            u64 const h[][2], const char *head)
+        */
+SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
+       ld1             {SHASH.2d}, [x3]
+       ld1             {XL.2d}, [x1]
+
+       __pmull_pre_p8
+
+       /* do the head block first, if supplied */
+       cbz             x4, 0f
+       ld1             {T1.2d}, [x4]
+       mov             x4, xzr
+       b               3f
+
+0:     ld1             {T1.2d}, [x2], #16
+       sub             w0, w0, #1
+
+3:     /* multiply XL by SHASH in GF(2^128) */
+CPU_LE(        rev64           T1.16b, T1.16b  )
+
+       ext             T2.16b, XL.16b, XL.16b, #8
+       ext             IN1.16b, T1.16b, T1.16b, #8
+       eor             T1.16b, T1.16b, T2.16b
+       eor             XL.16b, XL.16b, IN1.16b
+
+       __pmull2_p8     XH, XL, SHASH                   // a1 * b1
+       eor             T1.16b, T1.16b, XL.16b
+       __pmull_p8      XL, XL, SHASH                   // a0 * b0
+       __pmull_p8      XM, T1, SHASH2                  // (a1 + a0)(b1 + b0)
+
+       eor             T2.16b, XL.16b, XH.16b
+       ext             T1.16b, XL.16b, XH.16b, #8
+       eor             XM.16b, XM.16b, T2.16b
+
+       __pmull_reduce_p8
+
+       eor             T2.16b, T2.16b, XH.16b
+       eor             XL.16b, XL.16b, T2.16b
+
+       cbnz            w0, 0b
+
+       st1             {XL.2d}, [x1]
+       ret
+SYM_FUNC_END(pmull_ghash_update_p8)