git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
lib/crypto: x86/ghash: Migrate optimized code into library
author: Eric Biggers <ebiggers@kernel.org>
Thu, 19 Mar 2026 06:17:15 +0000 (23:17 -0700)
committer: Eric Biggers <ebiggers@kernel.org>
Mon, 23 Mar 2026 23:44:29 +0000 (16:44 -0700)
Remove the "ghash-pclmulqdqni" crypto_shash algorithm.  Move the
corresponding assembly code into lib/crypto/, and wire it up to the
GHASH library.

This makes the GHASH library be optimized with x86's carryless
multiplication instructions.  It also greatly reduces the amount of
x86-specific glue code that is needed, and it fixes the issue where this
GHASH optimization was disabled by default.

Rename and adjust the prototypes of the assembly functions to make them
fit better with the library.  Remove the byte-swaps (pshufb
instructions) that are no longer necessary because the library keeps the
accumulator in POLYVAL format rather than GHASH format.

Rename clmul_ghash_mul() to polyval_mul_pclmul() to reflect that it
really does a POLYVAL style multiplication.  Wire it up to both
ghash_mul_arch() and polyval_mul_arch().

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20260319061723.1140720-15-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
arch/x86/crypto/Kconfig
arch/x86/crypto/Makefile
arch/x86/crypto/ghash-clmulni-intel_glue.c [deleted file]
lib/crypto/Makefile
lib/crypto/x86/gf128hash.h
lib/crypto/x86/ghash-pclmul.S [moved from arch/x86/crypto/ghash-clmulni-intel_asm.S with 54% similarity]

index 7fb2319a0916409559f8f421a9eb76e14ae680e6..905e8a23cec3abf67026e0c21f24909bb0e2acff 100644 (file)
@@ -344,14 +344,4 @@ config CRYPTO_SM3_AVX_X86_64
 
          If unsure, say N.
 
-config CRYPTO_GHASH_CLMUL_NI_INTEL
-       tristate "Hash functions: GHASH (CLMUL-NI)"
-       depends on 64BIT
-       select CRYPTO_CRYPTD
-       help
-         GCM GHASH hash function (NIST SP800-38D)
-
-         Architecture: x86_64 using:
-         - CLMUL-NI (carry-less multiplication new instructions)
-
 endmenu
index b21ad0978c52bbd265e5c34a7b29406caba96b3c..d562f4341da69d6d51c01e467c7af395529d1a8a 100644 (file)
@@ -50,9 +50,6 @@ aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \
                               aes-gcm-vaes-avx512.o \
                               aes-xts-avx-x86_64.o
 
-obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
-ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
-
 obj-$(CONFIG_CRYPTO_SM3_AVX_X86_64) += sm3-avx-x86_64.o
 sm3-avx-x86_64-y := sm3-avx-asm_64.o sm3_avx_glue.o
 
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
deleted file mode 100644 (file)
index aea5d4d..0000000
+++ /dev/null
@@ -1,163 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
- * instructions. This file contains glue code.
- *
- * Copyright (c) 2009 Intel Corp.
- *   Author: Huang Ying <ying.huang@intel.com>
- */
-
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
-#include <crypto/b128ops.h>
-#include <crypto/ghash.h>
-#include <crypto/internal/hash.h>
-#include <crypto/utils.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/unaligned.h>
-
-asmlinkage void clmul_ghash_mul(char *dst, const le128 *shash);
-
-asmlinkage int clmul_ghash_update(char *dst, const char *src,
-                                 unsigned int srclen, const le128 *shash);
-
-struct x86_ghash_ctx {
-       le128 shash;
-};
-
-static int ghash_init(struct shash_desc *desc)
-{
-       struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
-
-       memset(dctx, 0, sizeof(*dctx));
-
-       return 0;
-}
-
-static int ghash_setkey(struct crypto_shash *tfm,
-                       const u8 *key, unsigned int keylen)
-{
-       struct x86_ghash_ctx *ctx = crypto_shash_ctx(tfm);
-       u64 a, b;
-
-       if (keylen != GHASH_BLOCK_SIZE)
-               return -EINVAL;
-
-       /*
-        * GHASH maps bits to polynomial coefficients backwards, which makes it
-        * hard to implement.  But it can be shown that the GHASH multiplication
-        *
-        *      D * K (mod x^128 + x^7 + x^2 + x + 1)
-        *
-        * (where D is a data block and K is the key) is equivalent to:
-        *
-        *      bitreflect(D) * bitreflect(K) * x^(-127)
-        *              (mod x^128 + x^127 + x^126 + x^121 + 1)
-        *
-        * So, the code below precomputes:
-        *
-        *      bitreflect(K) * x^(-127) (mod x^128 + x^127 + x^126 + x^121 + 1)
-        *
-        * ... but in Montgomery form (so that Montgomery multiplication can be
-        * used), i.e. with an extra x^128 factor, which means actually:
-        *
-        *      bitreflect(K) * x (mod x^128 + x^127 + x^126 + x^121 + 1)
-        *
-        * The within-a-byte part of bitreflect() cancels out GHASH's built-in
-        * reflection, and thus bitreflect() is actually a byteswap.
-        */
-       a = get_unaligned_be64(key);
-       b = get_unaligned_be64(key + 8);
-       ctx->shash.a = cpu_to_le64((a << 1) | (b >> 63));
-       ctx->shash.b = cpu_to_le64((b << 1) | (a >> 63));
-       if (a >> 63)
-               ctx->shash.a ^= cpu_to_le64((u64)0xc2 << 56);
-       return 0;
-}
-
-static int ghash_update(struct shash_desc *desc,
-                        const u8 *src, unsigned int srclen)
-{
-       struct x86_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
-       struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
-       u8 *dst = dctx->buffer;
-       int remain;
-
-       kernel_fpu_begin();
-       remain = clmul_ghash_update(dst, src, srclen, &ctx->shash);
-       kernel_fpu_end();
-       return remain;
-}
-
-static void ghash_flush(struct x86_ghash_ctx *ctx, struct ghash_desc_ctx *dctx,
-                       const u8 *src, unsigned int len)
-{
-       u8 *dst = dctx->buffer;
-
-       kernel_fpu_begin();
-       if (len) {
-               crypto_xor(dst, src, len);
-               clmul_ghash_mul(dst, &ctx->shash);
-       }
-       kernel_fpu_end();
-}
-
-static int ghash_finup(struct shash_desc *desc, const u8 *src,
-                      unsigned int len, u8 *dst)
-{
-       struct x86_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
-       struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
-       u8 *buf = dctx->buffer;
-
-       ghash_flush(ctx, dctx, src, len);
-       memcpy(dst, buf, GHASH_BLOCK_SIZE);
-
-       return 0;
-}
-
-static struct shash_alg ghash_alg = {
-       .digestsize     = GHASH_DIGEST_SIZE,
-       .init           = ghash_init,
-       .update         = ghash_update,
-       .finup          = ghash_finup,
-       .setkey         = ghash_setkey,
-       .descsize       = sizeof(struct ghash_desc_ctx),
-       .base           = {
-               .cra_name               = "ghash",
-               .cra_driver_name        = "ghash-pclmulqdqni",
-               .cra_priority           = 400,
-               .cra_flags              = CRYPTO_AHASH_ALG_BLOCK_ONLY,
-               .cra_blocksize          = GHASH_BLOCK_SIZE,
-               .cra_ctxsize            = sizeof(struct x86_ghash_ctx),
-               .cra_module             = THIS_MODULE,
-       },
-};
-
-static const struct x86_cpu_id pcmul_cpu_id[] = {
-       X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), /* Pickle-Mickle-Duck */
-       {}
-};
-MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
-
-static int __init ghash_pclmulqdqni_mod_init(void)
-{
-       if (!x86_match_cpu(pcmul_cpu_id))
-               return -ENODEV;
-
-       return crypto_register_shash(&ghash_alg);
-}
-
-static void __exit ghash_pclmulqdqni_mod_exit(void)
-{
-       crypto_unregister_shash(&ghash_alg);
-}
-
-module_init(ghash_pclmulqdqni_mod_init);
-module_exit(ghash_pclmulqdqni_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GHASH hash function, accelerated by PCLMULQDQ-NI");
-MODULE_ALIAS_CRYPTO("ghash");
index 8950509833afe3133647bb2045bf4e43b83856b3..19c67f70fb3866c15e36df4930668155eafae6fc 100644 (file)
@@ -174,7 +174,8 @@ OBJECT_FILES_NON_STANDARD_powerpc/ghashp8-ppc.o := y
 endif
 
 libgf128hash-$(CONFIG_RISCV) += riscv/ghash-riscv64-zvkg.o
-libgf128hash-$(CONFIG_X86) += x86/polyval-pclmul-avx.o
+libgf128hash-$(CONFIG_X86) += x86/ghash-pclmul.o \
+                             x86/polyval-pclmul-avx.o
 endif # CONFIG_CRYPTO_LIB_GF128HASH_ARCH
 
 # clean-files must be defined unconditionally
index adf6147ea6779f673e8173b06f40029ad2d72037..6b79b06caab0a3e03d4c9a13657dd14f737edf3d 100644 (file)
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
- * POLYVAL library functions, x86_64 optimized
+ * GHASH and POLYVAL, x86_64 optimized
  *
  * Copyright 2025 Google LLC
  */
@@ -9,10 +9,17 @@
 
 #define NUM_H_POWERS 8
 
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmul);
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmul_avx);
 
+asmlinkage void polyval_mul_pclmul(struct polyval_elem *a,
+                                  const struct polyval_elem *b);
 asmlinkage void polyval_mul_pclmul_avx(struct polyval_elem *a,
                                       const struct polyval_elem *b);
+
+asmlinkage void ghash_blocks_pclmul(struct polyval_elem *acc,
+                                   const struct polyval_elem *key,
+                                   const u8 *data, size_t nblocks);
 asmlinkage void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
                                          const struct polyval_key *key,
                                          const u8 *data, size_t nblocks);
@@ -41,16 +48,54 @@ static void polyval_preparekey_arch(struct polyval_key *key,
        }
 }
 
+static void polyval_mul_x86(struct polyval_elem *a,
+                           const struct polyval_elem *b)
+{
+       if (static_branch_likely(&have_pclmul) && irq_fpu_usable()) {
+               kernel_fpu_begin();
+               if (static_branch_likely(&have_pclmul_avx))
+                       polyval_mul_pclmul_avx(a, b);
+               else
+                       polyval_mul_pclmul(a, b);
+               kernel_fpu_end();
+       } else {
+               polyval_mul_generic(a, b);
+       }
+}
+
+#define ghash_mul_arch ghash_mul_arch
+static void ghash_mul_arch(struct polyval_elem *acc,
+                          const struct ghash_key *key)
+{
+       polyval_mul_x86(acc, &key->h);
+}
+
 #define polyval_mul_arch polyval_mul_arch
 static void polyval_mul_arch(struct polyval_elem *acc,
                             const struct polyval_key *key)
 {
-       if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
-               kernel_fpu_begin();
-               polyval_mul_pclmul_avx(acc, &key->h_powers[NUM_H_POWERS - 1]);
-               kernel_fpu_end();
+       polyval_mul_x86(acc, &key->h_powers[NUM_H_POWERS - 1]);
+}
+
+#define ghash_blocks_arch ghash_blocks_arch
+static void ghash_blocks_arch(struct polyval_elem *acc,
+                             const struct ghash_key *key,
+                             const u8 *data, size_t nblocks)
+{
+       if (static_branch_likely(&have_pclmul) && irq_fpu_usable()) {
+               do {
+                       /* Allow rescheduling every 4 KiB. */
+                       size_t n = min_t(size_t, nblocks,
+                                        4096 / GHASH_BLOCK_SIZE);
+
+                       kernel_fpu_begin();
+                       ghash_blocks_pclmul(acc, &key->h, data, n);
+                       kernel_fpu_end();
+                       data += n * GHASH_BLOCK_SIZE;
+                       nblocks -= n;
+               } while (nblocks);
        } else {
-               polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
+               ghash_blocks_generic(acc, &key->h, data, nblocks);
        }
 }
 
@@ -80,7 +125,9 @@ static void polyval_blocks_arch(struct polyval_elem *acc,
 #define gf128hash_mod_init_arch gf128hash_mod_init_arch
 static void gf128hash_mod_init_arch(void)
 {
-       if (boot_cpu_has(X86_FEATURE_PCLMULQDQ) &&
-           boot_cpu_has(X86_FEATURE_AVX))
-               static_branch_enable(&have_pclmul_avx);
+       if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+               static_branch_enable(&have_pclmul);
+               if (boot_cpu_has(X86_FEATURE_AVX))
+                       static_branch_enable(&have_pclmul_avx);
+       }
 }
similarity index 54%
rename from arch/x86/crypto/ghash-clmulni-intel_asm.S
rename to lib/crypto/x86/ghash-pclmul.S
index c4fbaa82ed7a7f546ba752627bae648b47d08674..6ffb5aea6063d05f1e96adb3af073804b25d6865 100644 (file)
@@ -21,8 +21,8 @@
 .Lbswap_mask:
        .octa 0x000102030405060708090a0b0c0d0e0f
 
-#define DATA   %xmm0
-#define SHASH  %xmm1
+#define ACC    %xmm0
+#define KEY    %xmm1
 #define T1     %xmm2
 #define T2     %xmm3
 #define T3     %xmm4
 /*
  * __clmul_gf128mul_ble:       internal ABI
  * input:
- *     DATA:                   operand1
- *     SHASH:                  operand2, hash_key << 1 mod poly
+ *     ACC:                    operand1
+ *     KEY:                    operand2, hash_key << 1 mod poly
  * output:
- *     DATA:                   operand1 * operand2 mod poly
+ *     ACC:                    operand1 * operand2 mod poly
  * changed:
  *     T1
  *     T2
  *     T3
  */
 SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
-       movaps DATA, T1
-       pshufd $0b01001110, DATA, T2
-       pshufd $0b01001110, SHASH, T3
-       pxor DATA, T2
-       pxor SHASH, T3
+       movaps ACC, T1
+       pshufd $0b01001110, ACC, T2
+       pshufd $0b01001110, KEY, T3
+       pxor ACC, T2
+       pxor KEY, T3
 
-       pclmulqdq $0x00, SHASH, DATA    # DATA = a0 * b0
-       pclmulqdq $0x11, SHASH, T1      # T1 = a1 * b1
+       pclmulqdq $0x00, KEY, ACC       # ACC = a0 * b0
+       pclmulqdq $0x11, KEY, T1        # T1 = a1 * b1
        pclmulqdq $0x00, T3, T2         # T2 = (a1 + a0) * (b1 + b0)
-       pxor DATA, T2
+       pxor ACC, T2
        pxor T1, T2                     # T2 = a0 * b1 + a1 * b0
 
        movaps T2, T3
        pslldq $8, T3
        psrldq $8, T2
-       pxor T3, DATA
-       pxor T2, T1                     # <T1:DATA> is result of
+       pxor T3, ACC
+       pxor T2, T1                     # <T1:ACC> is result of
                                        # carry-less multiplication
 
        # first phase of the reduction
-       movaps DATA, T3
+       movaps ACC, T3
        psllq $1, T3
-       pxor DATA, T3
+       pxor ACC, T3
        psllq $5, T3
-       pxor DATA, T3
+       pxor ACC, T3
        psllq $57, T3
        movaps T3, T2
        pslldq $8, T2
        psrldq $8, T3
-       pxor T2, DATA
+       pxor T2, ACC
        pxor T3, T1
 
        # second phase of the reduction
-       movaps DATA, T2
+       movaps ACC, T2
        psrlq $5, T2
-       pxor DATA, T2
+       pxor ACC, T2
        psrlq $1, T2
-       pxor DATA, T2
+       pxor ACC, T2
        psrlq $1, T2
        pxor T2, T1
-       pxor T1, DATA
+       pxor T1, ACC
        RET
 SYM_FUNC_END(__clmul_gf128mul_ble)
 
-/* void clmul_ghash_mul(char *dst, const le128 *shash) */
-SYM_FUNC_START(clmul_ghash_mul)
+/*
+ * void polyval_mul_pclmul(struct polyval_elem *a,
+ *                        const struct polyval_elem *b)
+ */
+SYM_FUNC_START(polyval_mul_pclmul)
        FRAME_BEGIN
-       movups (%rdi), DATA
-       movups (%rsi), SHASH
-       movaps .Lbswap_mask(%rip), BSWAP
-       pshufb BSWAP, DATA
+       movups (%rdi), ACC
+       movups (%rsi), KEY
        call __clmul_gf128mul_ble
-       pshufb BSWAP, DATA
-       movups DATA, (%rdi)
+       movups ACC, (%rdi)
        FRAME_END
        RET
-SYM_FUNC_END(clmul_ghash_mul)
+SYM_FUNC_END(polyval_mul_pclmul)
 
 /*
- * int clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
- *                       const le128 *shash);
+ * void ghash_blocks_pclmul(struct polyval_elem *acc,
+ *                         const struct polyval_elem *key,
+ *                         const u8 *data, size_t nblocks)
  */
-SYM_FUNC_START(clmul_ghash_update)
+SYM_FUNC_START(ghash_blocks_pclmul)
        FRAME_BEGIN
-       cmp $16, %rdx
-       jb .Lupdate_just_ret    # check length
        movaps .Lbswap_mask(%rip), BSWAP
-       movups (%rdi), DATA
-       movups (%rcx), SHASH
-       pshufb BSWAP, DATA
+       movups (%rdi), ACC
+       movups (%rsi), KEY
 .align 4
-.Lupdate_loop:
-       movups (%rsi), IN1
+.Lnext_block:
+       movups (%rdx), IN1
        pshufb BSWAP, IN1
-       pxor IN1, DATA
+       pxor IN1, ACC
        call __clmul_gf128mul_ble
-       sub $16, %rdx
-       add $16, %rsi
-       cmp $16, %rdx
-       jge .Lupdate_loop
-       pshufb BSWAP, DATA
-       movups DATA, (%rdi)
-.Lupdate_just_ret:
-       mov %rdx, %rax
+       add $16, %rdx
+       dec %rcx
+       jnz .Lnext_block
+       movups ACC, (%rdi)
        FRAME_END
        RET
-SYM_FUNC_END(clmul_ghash_update)
+SYM_FUNC_END(ghash_blocks_pclmul)