From: Nick Mathewson Date: Sat, 26 Apr 2025 02:04:23 +0000 (-0400) Subject: polyval: Detect pclmul presence using cpuid. X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=62c7101ef1a734635215fc9128dfb023c1a6ffb9;p=thirdparty%2Ftor.git polyval: Detect pclmul presence using cpuid. --- diff --git a/src/ext/polyval/ctmul64.c b/src/ext/polyval/ctmul64.c index 7cf947b62a..297a105bf9 100644 --- a/src/ext/polyval/ctmul64.c +++ b/src/ext/polyval/ctmul64.c @@ -78,10 +78,10 @@ pv_mul_y_h_ctmul64(polyval_t *pv) uint64_t y0, y1; uint64_t h0, h1, h2, h0r, h1r, h2r; - y0 = pv->y.lo; - y1 = pv->y.hi; - h0 = pv->key.h.lo; - h1 = pv->key.h.hi; + y0 = CTMUL64_MEMBER(pv->y).lo; + y1 = CTMUL64_MEMBER(pv->y).hi; + h0 = CTMUL64_MEMBER(pv->key.h).lo; + h1 = CTMUL64_MEMBER(pv->key.h).hi; h0r = rev64(h0); h1r = rev64(h1); @@ -127,7 +127,7 @@ pv_mul_y_h_ctmul64(polyval_t *pv) v3 ^= v1 ^ (v1 >> 1) ^ (v1 >> 2) ^ (v1 >> 7); v2 ^= (v1 << 63) ^ (v1 << 62) ^ (v1 << 57); - pv->y.lo = v2; - pv->y.hi = v3; + CTMUL64_MEMBER(pv->y).lo = v2; + CTMUL64_MEMBER(pv->y).hi = v3; } } diff --git a/src/ext/polyval/pclmul.c b/src/ext/polyval/pclmul.c index 9e52705bc3..47a786b6b8 100644 --- a/src/ext/polyval/pclmul.c +++ b/src/ext/polyval/pclmul.c @@ -156,14 +156,14 @@ void pv_mul_y_h_pclmul(polyval_t *pv) { __m128i yw, h1w, h1x; - h1w = pv->key.h; + h1w = PCLMUL_MEMBER(pv->key.h); BK(h1w, h1x); { __m128i aw, ax; __m128i t0, t1, t2, t3; - aw = pv->y; + aw = PCLMUL_MEMBER(pv->y); BK(aw, ax); t1 = pclmulqdq11(aw, h1w); @@ -180,5 +180,5 @@ void pv_mul_y_h_pclmul(polyval_t *pv) yw = _mm_unpacklo_epi64(t1, t0); } - pv->y = yw; + PCLMUL_MEMBER(pv->y) = yw; } diff --git a/src/ext/polyval/polyval.c b/src/ext/polyval/polyval.c index ee8aa5d1d5..c2b4d0c383 100644 --- a/src/ext/polyval/polyval.c +++ b/src/ext/polyval/polyval.c @@ -39,6 +39,10 @@ #include <string.h> +#ifdef PV_USE_PCLMUL_DETECT +#include <cpuid.h> +#endif + typedef pv_u128_ u128; /* ======== @@ -73,7 +77,7 @@ static inline void pv_xor_y(polyval_t *, u128
v); * * (This is a carryless multiply in the Polyval galois field) */ -static void pv_mul_y_h(polyval_t *); +static void pv_mul_y_h(polyval_t *);h #endif /* ===== @@ -118,54 +122,73 @@ bswap32(uint64_t v) #define convert_byte_order32(x) (x) #endif -#ifdef PV_USE_PCLMUL +#if defined PV_USE_PCLMUL_UNCONDITIONAL +#define PCLMUL_MEMBER(v) (v) +#define PV_USE_PCLMUL + +#elif defined PV_USE_PCLMUL_DETECT +#define PCLMUL_MEMBER(v) (v).u128x1 +#define CTMUL64_MEMBER(v) (v).u64x2 +#define PV_USE_PCLMUL +#define PV_USE_CTMUL64 +#elif defined PV_USE_CTMUL64 +#define CTMUL64_MEMBER(v) (v) +#endif + +#ifdef PV_USE_PCLMUL #include "ext/polyval/pclmul.c" static inline u128 u128_from_bytes_pclmul(const uint8_t *bytes) { - return _mm_loadu_si128((const u128*)bytes); + u128 r; + PCLMUL_MEMBER(r) = _mm_loadu_si128((const __m128i*)bytes); + return r; } static inline void u128_to_bytes_pclmul(u128 val, uint8_t *bytes_out) { - _mm_storeu_si128((u128*)bytes_out, val); + _mm_storeu_si128((__m128i*)bytes_out, PCLMUL_MEMBER(val)); } static inline void pv_xor_y_pclmul(polyval_t *pv, u128 v) { - pv->y = _mm_xor_si128(pv->y, v); + PCLMUL_MEMBER(pv->y) = _mm_xor_si128(PCLMUL_MEMBER(pv->y), + PCLMUL_MEMBER(v)); } -#elif defined(PV_USE_CTMUL64) +#endif +#if defined(PV_USE_CTMUL64) #include "ext/polyval/ctmul64.c" static inline u128 u128_from_bytes_ctmul64(const uint8_t *bytes) { u128 r; - memcpy(&r.lo, bytes, 8); - memcpy(&r.hi, bytes + 8, 8); - r.lo = convert_byte_order64(r.lo); - r.hi = convert_byte_order64(r.hi); + memcpy(&CTMUL64_MEMBER(r).lo, bytes, 8); + memcpy(&CTMUL64_MEMBER(r).hi, bytes + 8, 8); + CTMUL64_MEMBER(r).lo = convert_byte_order64(CTMUL64_MEMBER(r).lo); + CTMUL64_MEMBER(r).hi = convert_byte_order64(CTMUL64_MEMBER(r).hi); return r; } static inline void u128_to_bytes_ctmul64(u128 val, uint8_t *bytes_out) { - uint64_t lo = convert_byte_order64(val.lo); - uint64_t hi = convert_byte_order64(val.hi); + uint64_t lo = convert_byte_order64(CTMUL64_MEMBER(val).lo); + uint64_t hi = 
convert_byte_order64(CTMUL64_MEMBER(val).hi); memcpy(bytes_out, &lo, 8); memcpy(bytes_out + 8, &hi, 8); } static inline void pv_xor_y_ctmul64(polyval_t *pv, u128 val) { - pv->y.lo ^= val.lo; - pv->y.hi ^= val.hi; + CTMUL64_MEMBER(pv->y).lo ^= CTMUL64_MEMBER(val).lo; + CTMUL64_MEMBER(pv->y).hi ^= CTMUL64_MEMBER(val).hi; } -#elif defined(PV_USE_CTMUL) +#endif + +#if defined(PV_USE_CTMUL) #include "ext/polyval/ctmul.c" static inline u128 @@ -252,7 +275,85 @@ pv_xor_y_ctmul(polyval_t *pv, u128 val) memset(&pv->y, 0, sizeof(u128)); \ } -#ifdef PV_USE_PCLMUL +#ifdef PV_USE_PCLMUL_DETECT +/* We use a boolean to distinguish whether to use the PCLMUL instructions, + * but instead we could use function pointers. It's probably worth + * benchmarking, though it's unlikely to make a measurable difference. + */ +static bool use_pclmul = false; + +/* Declare _both_ variations of our code, statically, + * with different prefixes. */ +PV_DECLARE(pclmul_, static, + u128_from_bytes_pclmul, + u128_to_bytes_pclmul, + pv_xor_y_pclmul, + pv_mul_y_h_pclmul) + +PV_DECLARE(ctmul64_, static, + u128_from_bytes_ctmul64, + u128_to_bytes_ctmul64, + pv_xor_y_ctmul64, + pv_mul_y_h_ctmul64) + +void +polyval_key_init(polyval_key_t *pv, const uint8_t *key) +{ + if (use_pclmul) + pclmul_polyval_key_init(pv, key); + else + ctmul64_polyval_key_init(pv, key); +} +void +polyval_init(polyval_t *pv, const uint8_t *key) +{ + if (use_pclmul) + pclmul_polyval_init(pv, key); + else + ctmul64_polyval_init(pv, key); +} +void +polyval_init_from_key(polyval_t *pv, const polyval_key_t *key) +{ + if (use_pclmul) + pclmul_polyval_init_from_key(pv, key); + else + ctmul64_polyval_init_from_key(pv, key); +} +void +polyval_add_block(polyval_t *pv, const uint8_t *block) +{ + if (use_pclmul) + pclmul_polyval_add_block(pv, block); + else + ctmul64_polyval_add_block(pv, block); +} +void +polyval_add_zpad(polyval_t *pv, const uint8_t *data, size_t n) +{ + if (use_pclmul) + pclmul_polyval_add_zpad(pv, data, n); + else + 
ctmul64_polyval_add_zpad(pv, data, n); +} +void +polyval_get_tag(const polyval_t *pv, uint8_t *tag_out) +{ + if (use_pclmul) + pclmul_polyval_get_tag(pv, tag_out); + else + ctmul64_polyval_get_tag(pv, tag_out); +} +void +polyval_reset(polyval_t *pv) +{ + if (use_pclmul) + pclmul_polyval_reset(pv); + else + ctmul64_polyval_reset(pv); +} + +#elif defined(PV_USE_PCLMUL) PV_DECLARE(, , u128_from_bytes_pclmul, u128_to_bytes_pclmul, @@ -271,7 +372,25 @@ PV_DECLARE(, , u128_from_bytes_ctmul, u128_to_bytes_ctmul, pv_xor_y_ctmul, pv_mul_y_h_ctmul) +#endif +#ifdef PV_USE_PCLMUL_DETECT +void +polyval_detect_implementation(void) +{ + unsigned int eax, ebc, ecx, edx; + use_pclmul = false; + if (__get_cpuid(1, &eax, &ebc, &ecx, &edx)) { + if (0 != (ecx & (1<<1))) { + use_pclmul = true; + } + } +} +#else +void +polyval_detect_implementation(void) +{ +} #endif #if 0 diff --git a/src/ext/polyval/polyval.h b/src/ext/polyval/polyval.h index a0e71e81bd..a7cd8b3869 100644 --- a/src/ext/polyval/polyval.h +++ b/src/ext/polyval/polyval.h @@ -16,12 +16,22 @@ #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) \ || defined(_M_X64) || defined(_M_IX86) || defined(__i486) \ || defined(__i386__) -/* Use intel intrinsics for carryless multiply. - * - * TODO: In theory we should detect whether we have the relevant instructions, - * but they are all at least 15 years old. +#define PV_INTEL_ARCH +#endif + +#if defined(PV_INTEL_ARCH) && defined(__PCLMUL__) +/* We're building for an architecture that always has the intel + * intrinsics for carryless multiply. + * No need for runtime detection. + */ +#define PV_USE_PCLMUL_UNCONDITIONAL + +#elif defined(PV_INTEL_ARCH) && SIZEOF_VOID_P >= 8 +/* We _might_ have PCLMUL, or we might not. + * We need to detect it at runtime. */ -#define PV_USE_PCLMUL +#define PV_USE_PCLMUL_DETECT + #elif SIZEOF_VOID_P >= 8 /* It's a 64-bit architecture; use the generic 64-bit constant-time * implementation. @@ -40,9 +50,18 @@ * Declare a 128 bit integer type. 
* The exact representation will depend on which implementation we've chosen. */ -#ifdef PV_USE_PCLMUL +#if defined(PV_USE_PCLMUL_UNCONDITIONAL) #include <emmintrin.h> typedef __m128i pv_u128_; +#elif defined(PV_USE_PCLMUL_DETECT) +#include <emmintrin.h> +typedef union pv_u128_ { + __m128i u128x1; + struct { + uint64_t lo; + uint64_t hi; + } u64x2; +} pv_u128_; #elif defined(PV_USE_CTMUL64) typedef struct pv_u128_ { uint64_t lo; @@ -117,4 +136,7 @@ void polyval_get_tag(const polyval_t *, uint8_t *tag_out); */ void polyval_reset(polyval_t *); +/** If a faster-than-default polyval implementation is available, use it. */ +void polyval_detect_implementation(void); + #endif diff --git a/src/lib/crypt_ops/crypto_init.c b/src/lib/crypt_ops/crypto_init.c index ef9908c893..f846ca0fef 100644 --- a/src/lib/crypt_ops/crypto_init.c +++ b/src/lib/crypt_ops/crypto_init.c @@ -26,6 +26,7 @@ #include "lib/crypt_ops/crypto_options_st.h" #include "lib/conf/conftypes.h" #include "lib/log/util_bug.h" +#include "ext/polyval/polyval.h" #include "lib/subsys/subsys.h" @@ -69,6 +70,8 @@ crypto_early_init(void) crypto_nss_early_init(0); #endif + polyval_detect_implementation(); + if (crypto_seed_rng() < 0) return -1; if (crypto_init_siphash_key() < 0)