If unsure, say N.
-config CRYPTO_GHASH_CLMUL_NI_INTEL
- tristate "Hash functions: GHASH (CLMUL-NI)"
- depends on 64BIT
- select CRYPTO_CRYPTD
- help
- GCM GHASH hash function (NIST SP800-38D)
-
- Architecture: x86_64 using:
- - CLMUL-NI (carry-less multiplication new instructions)
-
endmenu
aes-gcm-vaes-avx512.o \
aes-xts-avx-x86_64.o
-obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
-ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
-
obj-$(CONFIG_CRYPTO_SM3_AVX_X86_64) += sm3-avx-x86_64.o
sm3-avx-x86_64-y := sm3-avx-asm_64.o sm3_avx_glue.o
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
- * instructions. This file contains glue code.
- *
- * Copyright (c) 2009 Intel Corp.
- * Author: Huang Ying <ying.huang@intel.com>
- */
-
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
-#include <crypto/b128ops.h>
-#include <crypto/ghash.h>
-#include <crypto/internal/hash.h>
-#include <crypto/utils.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/unaligned.h>
-
-asmlinkage void clmul_ghash_mul(char *dst, const le128 *shash);
-
-asmlinkage int clmul_ghash_update(char *dst, const char *src,
- unsigned int srclen, const le128 *shash);
-
-struct x86_ghash_ctx {
- le128 shash;
-};
-
-static int ghash_init(struct shash_desc *desc)
-{
- struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
-
- memset(dctx, 0, sizeof(*dctx));
-
- return 0;
-}
-
-static int ghash_setkey(struct crypto_shash *tfm,
- const u8 *key, unsigned int keylen)
-{
- struct x86_ghash_ctx *ctx = crypto_shash_ctx(tfm);
- u64 a, b;
-
- if (keylen != GHASH_BLOCK_SIZE)
- return -EINVAL;
-
- /*
- * GHASH maps bits to polynomial coefficients backwards, which makes it
- * hard to implement. But it can be shown that the GHASH multiplication
- *
- * D * K (mod x^128 + x^7 + x^2 + x + 1)
- *
- * (where D is a data block and K is the key) is equivalent to:
- *
- * bitreflect(D) * bitreflect(K) * x^(-127)
- * (mod x^128 + x^127 + x^126 + x^121 + 1)
- *
- * So, the code below precomputes:
- *
- * bitreflect(K) * x^(-127) (mod x^128 + x^127 + x^126 + x^121 + 1)
- *
- * ... but in Montgomery form (so that Montgomery multiplication can be
- * used), i.e. with an extra x^128 factor, which means actually:
- *
- * bitreflect(K) * x (mod x^128 + x^127 + x^126 + x^121 + 1)
- *
- * The within-a-byte part of bitreflect() cancels out GHASH's built-in
- * reflection, and thus bitreflect() is actually a byteswap.
- */
- a = get_unaligned_be64(key);
- b = get_unaligned_be64(key + 8);
- ctx->shash.a = cpu_to_le64((a << 1) | (b >> 63));
- ctx->shash.b = cpu_to_le64((b << 1) | (a >> 63));
- if (a >> 63)
- ctx->shash.a ^= cpu_to_le64((u64)0xc2 << 56);
- return 0;
-}
-
-static int ghash_update(struct shash_desc *desc,
- const u8 *src, unsigned int srclen)
-{
- struct x86_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
- struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
- u8 *dst = dctx->buffer;
- int remain;
-
- kernel_fpu_begin();
- remain = clmul_ghash_update(dst, src, srclen, &ctx->shash);
- kernel_fpu_end();
- return remain;
-}
-
-static void ghash_flush(struct x86_ghash_ctx *ctx, struct ghash_desc_ctx *dctx,
- const u8 *src, unsigned int len)
-{
- u8 *dst = dctx->buffer;
-
- kernel_fpu_begin();
- if (len) {
- crypto_xor(dst, src, len);
- clmul_ghash_mul(dst, &ctx->shash);
- }
- kernel_fpu_end();
-}
-
-static int ghash_finup(struct shash_desc *desc, const u8 *src,
- unsigned int len, u8 *dst)
-{
- struct x86_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
- struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
- u8 *buf = dctx->buffer;
-
- ghash_flush(ctx, dctx, src, len);
- memcpy(dst, buf, GHASH_BLOCK_SIZE);
-
- return 0;
-}
-
-static struct shash_alg ghash_alg = {
- .digestsize = GHASH_DIGEST_SIZE,
- .init = ghash_init,
- .update = ghash_update,
- .finup = ghash_finup,
- .setkey = ghash_setkey,
- .descsize = sizeof(struct ghash_desc_ctx),
- .base = {
- .cra_name = "ghash",
- .cra_driver_name = "ghash-pclmulqdqni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = GHASH_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct x86_ghash_ctx),
- .cra_module = THIS_MODULE,
- },
-};
-
-static const struct x86_cpu_id pcmul_cpu_id[] = {
- X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), /* Pickle-Mickle-Duck */
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
-
-static int __init ghash_pclmulqdqni_mod_init(void)
-{
- if (!x86_match_cpu(pcmul_cpu_id))
- return -ENODEV;
-
- return crypto_register_shash(&ghash_alg);
-}
-
-static void __exit ghash_pclmulqdqni_mod_exit(void)
-{
- crypto_unregister_shash(&ghash_alg);
-}
-
-module_init(ghash_pclmulqdqni_mod_init);
-module_exit(ghash_pclmulqdqni_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GHASH hash function, accelerated by PCLMULQDQ-NI");
-MODULE_ALIAS_CRYPTO("ghash");
endif
libgf128hash-$(CONFIG_RISCV) += riscv/ghash-riscv64-zvkg.o
-libgf128hash-$(CONFIG_X86) += x86/polyval-pclmul-avx.o
+libgf128hash-$(CONFIG_X86) += x86/ghash-pclmul.o \
+ x86/polyval-pclmul-avx.o
endif # CONFIG_CRYPTO_LIB_GF128HASH_ARCH
# clean-files must be defined unconditionally
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
- * POLYVAL library functions, x86_64 optimized
+ * GHASH and POLYVAL, x86_64 optimized
*
* Copyright 2025 Google LLC
*/
#define NUM_H_POWERS 8
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmul);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmul_avx);
+asmlinkage void polyval_mul_pclmul(struct polyval_elem *a,
+ const struct polyval_elem *b);
asmlinkage void polyval_mul_pclmul_avx(struct polyval_elem *a,
const struct polyval_elem *b);
+
+asmlinkage void ghash_blocks_pclmul(struct polyval_elem *acc,
+ const struct polyval_elem *key,
+ const u8 *data, size_t nblocks);
asmlinkage void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
const struct polyval_key *key,
const u8 *data, size_t nblocks);
}
}
+/*
+ * polyval_mul_x86() - one GF(2^128) multiplication, multiplying @a by @b
+ * in place.
+ *
+ * Shared dispatch helper for the GHASH and POLYVAL entry points below:
+ * when the CPU has PCLMULQDQ and the kernel FPU may be used in the current
+ * context, run the assembly implementation (VEX-encoded AVX variant when
+ * available) inside a kernel_fpu_begin/end section; otherwise fall back to
+ * the generic C implementation.
+ */
+static void polyval_mul_x86(struct polyval_elem *a,
+			    const struct polyval_elem *b)
+{
+	if (static_branch_likely(&have_pclmul) && irq_fpu_usable()) {
+		kernel_fpu_begin();
+		if (static_branch_likely(&have_pclmul_avx))
+			polyval_mul_pclmul_avx(a, b);
+		else
+			polyval_mul_pclmul(a, b);
+		kernel_fpu_end();
+	} else {
+		polyval_mul_generic(a, b);
+	}
+}
+
+#define ghash_mul_arch ghash_mul_arch
+/*
+ * ghash_mul_arch() - multiply the GHASH accumulator @acc by the hash key.
+ *
+ * Delegates to the shared x86 multiply helper.  NOTE(review): this assumes
+ * key->h has already been converted to the library's internal polyval_elem
+ * representation by the (out-of-view) key setup code -- confirm there.
+ */
+static void ghash_mul_arch(struct polyval_elem *acc,
+			   const struct ghash_key *key)
+{
+	polyval_mul_x86(acc, &key->h);
+}
+
#define polyval_mul_arch polyval_mul_arch
+/*
+ * polyval_mul_arch() - multiply the POLYVAL accumulator @acc by the hash
+ * key.  Uses the final h_powers[] entry; NOTE(review): presumably that
+ * slot holds H^1 per the key precomputation layout (defined outside this
+ * view) -- confirm against the key setup code.
+ */
static void polyval_mul_arch(struct polyval_elem *acc,
			     const struct polyval_key *key)
{
-	if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
-		kernel_fpu_begin();
-		polyval_mul_pclmul_avx(acc, &key->h_powers[NUM_H_POWERS - 1]);
-		kernel_fpu_end();
+	polyval_mul_x86(acc, &key->h_powers[NUM_H_POWERS - 1]);
+}
+
+#define ghash_blocks_arch ghash_blocks_arch
+/*
+ * ghash_blocks_arch() - fold @nblocks 16-byte GHASH blocks from @data into
+ * @acc.
+ *
+ * Fast path runs the PCLMULQDQ assembly under kernel_fpu_begin/end,
+ * splitting the input into 4 KiB chunks so preemption is not disabled for
+ * an unbounded time.  The do-while loop assumes @nblocks != 0 on entry.
+ * Slow path (no PCLMULQDQ, or FPU unusable in this context, e.g. from
+ * hardirq) falls back to the generic C implementation.
+ */
+static void ghash_blocks_arch(struct polyval_elem *acc,
+			      const struct ghash_key *key,
+			      const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_pclmul) && irq_fpu_usable()) {
+		do {
+			/* Allow rescheduling every 4 KiB. */
+			size_t n = min_t(size_t, nblocks,
+					 4096 / GHASH_BLOCK_SIZE);
+
+			kernel_fpu_begin();
+			ghash_blocks_pclmul(acc, &key->h, data, n);
+			kernel_fpu_end();
+			data += n * GHASH_BLOCK_SIZE;
+			nblocks -= n;
+		} while (nblocks);
	} else {
-		polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
+		ghash_blocks_generic(acc, &key->h, data, nblocks);
	}
}
#define gf128hash_mod_init_arch gf128hash_mod_init_arch
+/*
+ * gf128hash_mod_init_arch() - one-time CPU feature detection.
+ *
+ * PCLMULQDQ enables the base carry-less-multiply code paths; AVX
+ * additionally selects the VEX-encoded variant.  have_pclmul_avx is only
+ * enabled when have_pclmul is, which the nested static_branch checks in
+ * the dispatch helpers above rely on.
+ */
static void gf128hash_mod_init_arch(void)
{
-	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ) &&
-	    boot_cpu_has(X86_FEATURE_AVX))
-		static_branch_enable(&have_pclmul_avx);
+	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+		static_branch_enable(&have_pclmul);
+		if (boot_cpu_has(X86_FEATURE_AVX))
+			static_branch_enable(&have_pclmul_avx);
+	}
}
.Lbswap_mask:
.octa 0x000102030405060708090a0b0c0d0e0f
-#define DATA %xmm0
-#define SHASH %xmm1
+#define ACC %xmm0
+#define KEY %xmm1
#define T1 %xmm2
#define T2 %xmm3
#define T3 %xmm4
/*
* __clmul_gf128mul_ble: internal ABI
* input:
- * DATA: operand1
- * SHASH: operand2, hash_key << 1 mod poly
+ * ACC: operand1
+ * KEY: operand2, hash_key << 1 mod poly
* output:
- * DATA: operand1 * operand2 mod poly
+ * ACC: operand1 * operand2 mod poly
* changed:
* T1
* T2
* T3
*/
SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
- movaps DATA, T1
- pshufd $0b01001110, DATA, T2
- pshufd $0b01001110, SHASH, T3
- pxor DATA, T2
- pxor SHASH, T3
+ movaps ACC, T1
+ pshufd $0b01001110, ACC, T2
+ pshufd $0b01001110, KEY, T3
+ pxor ACC, T2
+ pxor KEY, T3
- pclmulqdq $0x00, SHASH, DATA # DATA = a0 * b0
- pclmulqdq $0x11, SHASH, T1 # T1 = a1 * b1
+ pclmulqdq $0x00, KEY, ACC # ACC = a0 * b0
+ pclmulqdq $0x11, KEY, T1 # T1 = a1 * b1
pclmulqdq $0x00, T3, T2 # T2 = (a1 + a0) * (b1 + b0)
- pxor DATA, T2
+ pxor ACC, T2
pxor T1, T2 # T2 = a0 * b1 + a1 * b0
movaps T2, T3
pslldq $8, T3
psrldq $8, T2
- pxor T3, DATA
- pxor T2, T1 # <T1:DATA> is result of
+ pxor T3, ACC
+ pxor T2, T1 # <T1:ACC> is result of
# carry-less multiplication
# first phase of the reduction
- movaps DATA, T3
+ movaps ACC, T3
psllq $1, T3
- pxor DATA, T3
+ pxor ACC, T3
psllq $5, T3
- pxor DATA, T3
+ pxor ACC, T3
psllq $57, T3
movaps T3, T2
pslldq $8, T2
psrldq $8, T3
- pxor T2, DATA
+ pxor T2, ACC
pxor T3, T1
# second phase of the reduction
- movaps DATA, T2
+ movaps ACC, T2
psrlq $5, T2
- pxor DATA, T2
+ pxor ACC, T2
psrlq $1, T2
- pxor DATA, T2
+ pxor ACC, T2
psrlq $1, T2
pxor T2, T1
- pxor T1, DATA
+ pxor T1, ACC
RET
SYM_FUNC_END(__clmul_gf128mul_ble)
-/* void clmul_ghash_mul(char *dst, const le128 *shash) */
-SYM_FUNC_START(clmul_ghash_mul)
+/*
+ * void polyval_mul_pclmul(struct polyval_elem *a,
+ * const struct polyval_elem *b)
+ */
+SYM_FUNC_START(polyval_mul_pclmul)
FRAME_BEGIN
- movups (%rdi), DATA
- movups (%rsi), SHASH
- movaps .Lbswap_mask(%rip), BSWAP
- pshufb BSWAP, DATA
+ movups (%rdi), ACC
+ movups (%rsi), KEY
call __clmul_gf128mul_ble
- pshufb BSWAP, DATA
- movups DATA, (%rdi)
+ movups ACC, (%rdi)
FRAME_END
RET
-SYM_FUNC_END(clmul_ghash_mul)
+SYM_FUNC_END(polyval_mul_pclmul)
/*
- * int clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
- * const le128 *shash);
+ * void ghash_blocks_pclmul(struct polyval_elem *acc,
+ * const struct polyval_elem *key,
+ * const u8 *data, size_t nblocks)
*/
-SYM_FUNC_START(clmul_ghash_update)
+SYM_FUNC_START(ghash_blocks_pclmul)
FRAME_BEGIN
- cmp $16, %rdx
- jb .Lupdate_just_ret # check length
movaps .Lbswap_mask(%rip), BSWAP
- movups (%rdi), DATA
- movups (%rcx), SHASH
- pshufb BSWAP, DATA
+ movups (%rdi), ACC
+ movups (%rsi), KEY
.align 4
-.Lupdate_loop:
- movups (%rsi), IN1
+.Lnext_block:
+ movups (%rdx), IN1
pshufb BSWAP, IN1
- pxor IN1, DATA
+ pxor IN1, ACC
call __clmul_gf128mul_ble
- sub $16, %rdx
- add $16, %rsi
- cmp $16, %rdx
- jge .Lupdate_loop
- pshufb BSWAP, DATA
- movups DATA, (%rdi)
-.Lupdate_just_ret:
- mov %rdx, %rax
+ add $16, %rdx
+ dec %rcx
+ jnz .Lnext_block
+ movups ACC, (%rdi)
FRAME_END
RET
-SYM_FUNC_END(clmul_ghash_update)
+SYM_FUNC_END(ghash_blocks_pclmul)