From: Eric Biggers
Date: Thu, 19 Mar 2026 06:17:09 +0000 (-0700)
Subject: crypto: arm64/ghash - Move NEON GHASH assembly into its own file
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e3f473db02dae210f91e8eb8d0423232175639d6;p=thirdparty%2Fkernel%2Fstable.git

crypto: arm64/ghash - Move NEON GHASH assembly into its own file

arch/arm64/crypto/ghash-ce-core.S implements pmull_ghash_update_p8(),
which is used only by a crypto_shash implementation of GHASH.  It also
implements other functions, including pmull_ghash_update_p64(), which
are used only by a crypto_aead implementation of AES-GCM.

While some code is shared between pmull_ghash_update_p8() and
pmull_ghash_update_p64(), it's not very much.  Since
pmull_ghash_update_p8() will also need to be migrated into lib/crypto/
to achieve parity in the standalone GHASH support, let's move it into a
separate file, ghash-neon-core.S.

Acked-by: Ard Biesheuvel
Link: https://lore.kernel.org/r/20260319061723.1140720-9-ebiggers@kernel.org
Signed-off-by: Eric Biggers
---

diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index 8a8e3e551ed3..b7ba43ce8584 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -27,7 +27,7 @@ obj-$(CONFIG_CRYPTO_SM4_ARM64_NEON_BLK) += sm4-neon.o
 sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o
 
 obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
-ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
+ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o ghash-neon-core.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
 aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index 23ee9a5eaf27..4344fe213d14 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Accelerated GHASH implementation with ARMv8 PMULL instructions.
+ * Accelerated AES-GCM implementation with ARMv8 Crypto Extensions.
  *
  * Copyright (C) 2014 - 2018 Linaro Ltd.
  */
@@ -19,31 +19,6 @@
 	XH		.req	v7
 	IN1		.req	v7
 
-	k00_16		.req	v8
-	k32_48		.req	v9
-
-	t3		.req	v10
-	t4		.req	v11
-	t5		.req	v12
-	t6		.req	v13
-	t7		.req	v14
-	t8		.req	v15
-	t9		.req	v16
-
-	perm1		.req	v17
-	perm2		.req	v18
-	perm3		.req	v19
-
-	sh1		.req	v20
-	sh2		.req	v21
-	sh3		.req	v22
-	sh4		.req	v23
-
-	ss1		.req	v24
-	ss2		.req	v25
-	ss3		.req	v26
-	ss4		.req	v27
-
 	XL2		.req	v8
 	XM2		.req	v9
 	XH2		.req	v10
@@ -60,90 +35,6 @@
 	.text
 	.arch		armv8-a+crypto
 
-	.macro		__pmull_p64, rd, rn, rm
-	pmull		\rd\().1q, \rn\().1d, \rm\().1d
-	.endm
-
-	.macro		__pmull2_p64, rd, rn, rm
-	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
-	.endm
-
-	.macro		__pmull_p8, rq, ad, bd
-	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
-	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
-	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3
-
-	__pmull_p8_\bd	\rq, \ad
-	.endm
-
-	.macro		__pmull2_p8, rq, ad, bd
-	tbl		t3.16b, {\ad\().16b}, perm1.16b	// A1
-	tbl		t5.16b, {\ad\().16b}, perm2.16b	// A2
-	tbl		t7.16b, {\ad\().16b}, perm3.16b	// A3
-
-	__pmull2_p8_\bd	\rq, \ad
-	.endm
-
-	.macro		__pmull_p8_SHASH, rq, ad
-	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
-	.endm
-
-	.macro		__pmull_p8_SHASH2, rq, ad
-	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
-	.endm
-
-	.macro		__pmull2_p8_SHASH, rq, ad
-	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
-	.endm
-
-	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
-	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
-	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
-	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
-	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
-	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
-	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
-	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
-	pmull\t		\rq\().8h, \ad, \bd			// D = A*B
-
-	eor		t3.16b, t3.16b, t4.16b			// L = E + F
-	eor		t5.16b, t5.16b, t6.16b			// M = G + H
-	eor		t7.16b, t7.16b, t8.16b			// N = I + J
-
-	uzp1		t4.2d, t3.2d, t5.2d
-	uzp2		t3.2d, t3.2d, t5.2d
-	uzp1		t6.2d, t7.2d, t9.2d
-	uzp2		t7.2d, t7.2d, t9.2d
-
-	// t3 = (L) (P0 + P1) << 8
-	// t5 = (M) (P2 + P3) << 16
-	eor		t4.16b, t4.16b, t3.16b
-	and		t3.16b, t3.16b, k32_48.16b
-
-	// t7 = (N) (P4 + P5) << 24
-	// t9 = (K) (P6 + P7) << 32
-	eor		t6.16b, t6.16b, t7.16b
-	and		t7.16b, t7.16b, k00_16.16b
-
-	eor		t4.16b, t4.16b, t3.16b
-	eor		t6.16b, t6.16b, t7.16b
-
-	zip2		t5.2d, t4.2d, t3.2d
-	zip1		t3.2d, t4.2d, t3.2d
-	zip2		t9.2d, t6.2d, t7.2d
-	zip1		t7.2d, t6.2d, t7.2d
-
-	ext		t3.16b, t3.16b, t3.16b, #15
-	ext		t5.16b, t5.16b, t5.16b, #14
-	ext		t7.16b, t7.16b, t7.16b, #13
-	ext		t9.16b, t9.16b, t9.16b, #12
-
-	eor		t3.16b, t3.16b, t5.16b
-	eor		t7.16b, t7.16b, t9.16b
-	eor		\rq\().16b, \rq\().16b, t3.16b
-	eor		\rq\().16b, \rq\().16b, t7.16b
-	.endm
-
 	.macro		__pmull_pre_p64
 	add		x8, x3, #16
 	ld1		{HH.2d-HH4.2d}, [x8]
@@ -160,43 +51,6 @@
 	shl		MASK.2d, MASK.2d, #57
 	.endm
 
-	.macro		__pmull_pre_p8
-	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
-	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
-
-	// k00_16 := 0x0000000000000000_000000000000ffff
-	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
-	movi		k32_48.2d, #0xffffffff
-	mov		k32_48.h[2], k32_48.h[0]
-	ushr		k00_16.2d, k32_48.2d, #32
-
-	// prepare the permutation vectors
-	mov_q		x5, 0x080f0e0d0c0b0a09
-	movi		T1.8b, #8
-	dup		perm1.2d, x5
-	eor		perm1.16b, perm1.16b, T1.16b
-	ushr		perm2.2d, perm1.2d, #8
-	ushr		perm3.2d, perm1.2d, #16
-	ushr		T1.2d, perm1.2d, #24
-	sli		perm2.2d, perm1.2d, #56
-	sli		perm3.2d, perm1.2d, #48
-	sli		T1.2d, perm1.2d, #40
-
-	// precompute loop invariants
-	tbl		sh1.16b, {SHASH.16b}, perm1.16b
-	tbl		sh2.16b, {SHASH.16b}, perm2.16b
-	tbl		sh3.16b, {SHASH.16b}, perm3.16b
-	tbl		sh4.16b, {SHASH.16b}, T1.16b
-	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
-	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
-	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
-	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
-	.endm
-
-	//
-	// PMULL (64x64->128) based reduction for CPUs that can do
-	// it in a single instruction.
-	//
 	.macro		__pmull_reduce_p64
 	pmull		T2.1q, XL.1d, MASK.1d
 	eor		XM.16b, XM.16b, T1.16b
@@ -209,39 +63,15 @@
 	pmull		XL.1q, XL.1d, MASK.1d
 	.endm
 
-	//
-	// Alternative reduction for CPUs that lack support for the
-	// 64x64->128 PMULL instruction
-	//
-	.macro		__pmull_reduce_p8
-	eor		XM.16b, XM.16b, T1.16b
-
-	mov		XL.d[1], XM.d[0]
-	mov		XH.d[0], XM.d[1]
-
-	shl		T1.2d, XL.2d, #57
-	shl		T2.2d, XL.2d, #62
-	eor		T2.16b, T2.16b, T1.16b
-	shl		T1.2d, XL.2d, #63
-	eor		T2.16b, T2.16b, T1.16b
-	ext		T1.16b, XL.16b, XH.16b, #8
-	eor		T2.16b, T2.16b, T1.16b
-
-	mov		XL.d[1], T2.d[0]
-	mov		XH.d[0], T2.d[1]
-
-	ushr		T2.2d, XL.2d, #1
-	eor		XH.16b, XH.16b, XL.16b
-	eor		XL.16b, XL.16b, T2.16b
-	ushr		T2.2d, T2.2d, #6
-	ushr		XL.2d, XL.2d, #1
-	.endm
-
-	.macro		__pmull_ghash, pn
+	/*
+	 * void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
+	 *			       u64 const h[][2], const char *head)
+	 */
+SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
 	ld1		{SHASH.2d}, [x3]
 	ld1		{XL.2d}, [x1]
 
-	__pmull_pre_\pn
+	__pmull_pre_p64
 
 	/* do the head block first, if supplied */
 	cbz		x4, 0f
@@ -249,7 +79,7 @@
 	mov		x4, xzr
 	b		3f
 
-0:	.ifc		\pn, p64
+0:
 	tbnz		w0, #0, 2f		// skip until #blocks is a
 	tbnz		w0, #1, 2f		// round multiple of 4
 
@@ -314,7 +144,6 @@
 
 	cbz		w0, 5f
 	b		1b
-	.endif
 
 2:	ld1		{T1.2d}, [x2], #16
 	sub		w0, w0, #1
@@ -327,16 +156,16 @@ CPU_LE(	rev64		T1.16b, T1.16b	)
 	eor		T1.16b, T1.16b, T2.16b
 	eor		XL.16b, XL.16b, IN1.16b
 
-	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
+	pmull2		XH.1q, XL.2d, SHASH.2d		// a1 * b1
 	eor		T1.16b, T1.16b, XL.16b
-	__pmull_\pn	XL, XL, SHASH			// a0 * b0
-	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)
+	pmull		XL.1q, XL.1d, SHASH.1d		// a0 * b0
+	pmull		XM.1q, T1.1d, SHASH2.1d		// (a1 + a0)(b1 + b0)
 
 4:	eor		T2.16b, XL.16b, XH.16b
 	ext		T1.16b, XL.16b, XH.16b, #8
 	eor		XM.16b, XM.16b, T2.16b
 
-	__pmull_reduce_\pn
+	__pmull_reduce_p64
 
 	eor		T2.16b, T2.16b, XH.16b
 	eor		XL.16b, XL.16b, T2.16b
@@ -345,20 +174,8 @@ CPU_LE(	rev64		T1.16b, T1.16b	)
 
 5:	st1		{XL.2d}, [x1]
 	ret
-	.endm
-
-	/*
-	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
-	 *			   struct ghash_key const *k, const char *head)
-	 */
-SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
-	__pmull_ghash	p64
 SYM_FUNC_END(pmull_ghash_update_p64)
 
-SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
-	__pmull_ghash	p8
-SYM_FUNC_END(pmull_ghash_update_p8)
-
 	KS0		.req	v8
 	KS1		.req	v9
 	KS2		.req	v10
diff --git a/arch/arm64/crypto/ghash-neon-core.S b/arch/arm64/crypto/ghash-neon-core.S
new file mode 100644
index 000000000000..6157135ad566
--- /dev/null
+++ b/arch/arm64/crypto/ghash-neon-core.S
@@ -0,0 +1,226 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Accelerated GHASH implementation with ARMv8 ASIMD instructions.
+ *
+ * Copyright (C) 2014 - 2018 Linaro Ltd.
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+#include <asm/assembler.h>
+
+	SHASH		.req	v0
+	SHASH2		.req	v1
+	T1		.req	v2
+	T2		.req	v3
+	XM		.req	v5
+	XL		.req	v6
+	XH		.req	v7
+	IN1		.req	v7
+
+	k00_16		.req	v8
+	k32_48		.req	v9
+
+	t3		.req	v10
+	t4		.req	v11
+	t5		.req	v12
+	t6		.req	v13
+	t7		.req	v14
+	t8		.req	v15
+	t9		.req	v16
+
+	perm1		.req	v17
+	perm2		.req	v18
+	perm3		.req	v19
+
+	sh1		.req	v20
+	sh2		.req	v21
+	sh3		.req	v22
+	sh4		.req	v23
+
+	ss1		.req	v24
+	ss2		.req	v25
+	ss3		.req	v26
+	ss4		.req	v27
+
+	.text
+
+	.macro		__pmull_p8, rq, ad, bd
+	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
+	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
+	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3
+
+	__pmull_p8_\bd	\rq, \ad
+	.endm
+
+	.macro		__pmull2_p8, rq, ad, bd
+	tbl		t3.16b, {\ad\().16b}, perm1.16b	// A1
+	tbl		t5.16b, {\ad\().16b}, perm2.16b	// A2
+	tbl		t7.16b, {\ad\().16b}, perm3.16b	// A3
+
+	__pmull2_p8_\bd	\rq, \ad
+	.endm
+
+	.macro		__pmull_p8_SHASH, rq, ad
+	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
+	.endm
+
+	.macro		__pmull_p8_SHASH2, rq, ad
+	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
+	.endm
+
+	.macro		__pmull2_p8_SHASH, rq, ad
+	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
+	.endm
+
+	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
+	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
+	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
+	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
+	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
+	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
+	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
+	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
+	pmull\t		\rq\().8h, \ad, \bd			// D = A*B
+
+	eor		t3.16b, t3.16b, t4.16b			// L = E + F
+	eor		t5.16b, t5.16b, t6.16b			// M = G + H
+	eor		t7.16b, t7.16b, t8.16b			// N = I + J
+
+	uzp1		t4.2d, t3.2d, t5.2d
+	uzp2		t3.2d, t3.2d, t5.2d
+	uzp1		t6.2d, t7.2d, t9.2d
+	uzp2		t7.2d, t7.2d, t9.2d
+
+	// t3 = (L) (P0 + P1) << 8
+	// t5 = (M) (P2 + P3) << 16
+	eor		t4.16b, t4.16b, t3.16b
+	and		t3.16b, t3.16b, k32_48.16b
+
+	// t7 = (N) (P4 + P5) << 24
+	// t9 = (K) (P6 + P7) << 32
+	eor		t6.16b, t6.16b, t7.16b
+	and		t7.16b, t7.16b, k00_16.16b
+
+	eor		t4.16b, t4.16b, t3.16b
+	eor		t6.16b, t6.16b, t7.16b
+
+	zip2		t5.2d, t4.2d, t3.2d
+	zip1		t3.2d, t4.2d, t3.2d
+	zip2		t9.2d, t6.2d, t7.2d
+	zip1		t7.2d, t6.2d, t7.2d
+
+	ext		t3.16b, t3.16b, t3.16b, #15
+	ext		t5.16b, t5.16b, t5.16b, #14
+	ext		t7.16b, t7.16b, t7.16b, #13
+	ext		t9.16b, t9.16b, t9.16b, #12
+
+	eor		t3.16b, t3.16b, t5.16b
+	eor		t7.16b, t7.16b, t9.16b
+	eor		\rq\().16b, \rq\().16b, t3.16b
+	eor		\rq\().16b, \rq\().16b, t7.16b
+	.endm
+
+	.macro		__pmull_pre_p8
+	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
+	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
+
+	// k00_16 := 0x0000000000000000_000000000000ffff
+	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
+	movi		k32_48.2d, #0xffffffff
+	mov		k32_48.h[2], k32_48.h[0]
+	ushr		k00_16.2d, k32_48.2d, #32
+
+	// prepare the permutation vectors
+	mov_q		x5, 0x080f0e0d0c0b0a09
+	movi		T1.8b, #8
+	dup		perm1.2d, x5
+	eor		perm1.16b, perm1.16b, T1.16b
+	ushr		perm2.2d, perm1.2d, #8
+	ushr		perm3.2d, perm1.2d, #16
+	ushr		T1.2d, perm1.2d, #24
+	sli		perm2.2d, perm1.2d, #56
+	sli		perm3.2d, perm1.2d, #48
+	sli		T1.2d, perm1.2d, #40
+
+	// precompute loop invariants
+	tbl		sh1.16b, {SHASH.16b}, perm1.16b
+	tbl		sh2.16b, {SHASH.16b}, perm2.16b
+	tbl		sh3.16b, {SHASH.16b}, perm3.16b
+	tbl		sh4.16b, {SHASH.16b}, T1.16b
+	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
+	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
+	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
+	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
+	.endm
+
+	.macro		__pmull_reduce_p8
+	eor		XM.16b, XM.16b, T1.16b
+
+	mov		XL.d[1], XM.d[0]
+	mov		XH.d[0], XM.d[1]
+
+	shl		T1.2d, XL.2d, #57
+	shl		T2.2d, XL.2d, #62
+	eor		T2.16b, T2.16b, T1.16b
+	shl		T1.2d, XL.2d, #63
+	eor		T2.16b, T2.16b, T1.16b
+	ext		T1.16b, XL.16b, XH.16b, #8
+	eor		T2.16b, T2.16b, T1.16b
+
+	mov		XL.d[1], T2.d[0]
+	mov		XH.d[0], T2.d[1]
+
+	ushr		T2.2d, XL.2d, #1
+	eor		XH.16b, XH.16b, XL.16b
+	eor		XL.16b, XL.16b, T2.16b
+	ushr		T2.2d, T2.2d, #6
+	ushr		XL.2d, XL.2d, #1
+	.endm
+
+	/*
+	 * void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
+	 *			      u64 const h[][2], const char *head)
+	 */
+SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
+	ld1		{SHASH.2d}, [x3]
+	ld1		{XL.2d}, [x1]
+
+	__pmull_pre_p8
+
+	/* do the head block first, if supplied */
+	cbz		x4, 0f
+	ld1		{T1.2d}, [x4]
+	mov		x4, xzr
+	b		3f
+
+0:	ld1		{T1.2d}, [x2], #16
+	sub		w0, w0, #1
+
+3:	/* multiply XL by SHASH in GF(2^128) */
+CPU_LE(	rev64		T1.16b, T1.16b	)
+
+	ext		T2.16b, XL.16b, XL.16b, #8
+	ext		IN1.16b, T1.16b, T1.16b, #8
+	eor		T1.16b, T1.16b, T2.16b
+	eor		XL.16b, XL.16b, IN1.16b
+
+	__pmull2_p8	XH, XL, SHASH			// a1 * b1
+	eor		T1.16b, T1.16b, XL.16b
+	__pmull_p8	XL, XL, SHASH			// a0 * b0
+	__pmull_p8	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)
+
+	eor		T2.16b, XL.16b, XH.16b
+	ext		T1.16b, XL.16b, XH.16b, #8
+	eor		XM.16b, XM.16b, T2.16b
+
+	__pmull_reduce_p8
+
+	eor		T2.16b, T2.16b, XH.16b
+	eor		XL.16b, XL.16b, T2.16b
+
+	cbnz		w0, 0b
+
+	st1		{XL.2d}, [x1]
+	ret
+SYM_FUNC_END(pmull_ghash_update_p8)