]> git.ipfire.org Git - thirdparty/tor.git/commitdiff
polyval: Detect pclmul presence using cpuid.
authorNick Mathewson <nickm@torproject.org>
Sat, 26 Apr 2025 02:04:23 +0000 (22:04 -0400)
committerNick Mathewson <nickm@torproject.org>
Wed, 21 May 2025 17:00:03 +0000 (13:00 -0400)
src/ext/polyval/ctmul64.c
src/ext/polyval/pclmul.c
src/ext/polyval/polyval.c
src/ext/polyval/polyval.h
src/lib/crypt_ops/crypto_init.c

index 7cf947b62a80f44ea84792b37f4cef9c381c406b..297a105bf9cc6515600018bef8696d87adba7f9d 100644 (file)
@@ -78,10 +78,10 @@ pv_mul_y_h_ctmul64(polyval_t *pv)
        uint64_t y0, y1;
        uint64_t h0, h1, h2, h0r, h1r, h2r;
 
-       y0 = pv->y.lo;
-       y1 = pv->y.hi;
-       h0 = pv->key.h.lo;
-       h1 = pv->key.h.hi;
+       y0 = CTMUL64_MEMBER(pv->y).lo;
+       y1 = CTMUL64_MEMBER(pv->y).hi;
+       h0 = CTMUL64_MEMBER(pv->key.h).lo;
+       h1 = CTMUL64_MEMBER(pv->key.h).hi;
        h0r = rev64(h0);
        h1r = rev64(h1);
 
@@ -127,7 +127,7 @@ pv_mul_y_h_ctmul64(polyval_t *pv)
                v3 ^= v1 ^ (v1 >> 1) ^ (v1 >> 2) ^ (v1 >> 7);
                v2 ^= (v1 << 63) ^ (v1 << 62) ^ (v1 << 57);
 
-               pv->y.lo = v2;
-               pv->y.hi = v3;
+               CTMUL64_MEMBER(pv->y).lo = v2;
+               CTMUL64_MEMBER(pv->y).hi = v3;
        }
 }
index 9e52705bc34f22cdf0d54998dff28e9702c29fa8..47a786b6b862f0d0cf260038253e30e6a87957dd 100644 (file)
@@ -156,14 +156,14 @@ void pv_mul_y_h_pclmul(polyval_t *pv)
 {
        __m128i yw, h1w, h1x;
 
-        h1w = pv->key.h;
+        h1w = PCLMUL_MEMBER(pv->key.h);
         BK(h1w, h1x);
 
         {
                __m128i aw, ax;
                __m128i t0, t1, t2, t3;
 
-                aw = pv->y;
+                aw = PCLMUL_MEMBER(pv->y);
                BK(aw, ax);
 
                t1 = pclmulqdq11(aw, h1w);
@@ -180,5 +180,5 @@ void pv_mul_y_h_pclmul(polyval_t *pv)
                yw = _mm_unpacklo_epi64(t1, t0);
        }
 
-       pv->y = yw;
+       PCLMUL_MEMBER(pv->y) = yw;
 }
index ee8aa5d1d5a3760e7d59910da31b200da853927f..c2b4d0c383f6e6845944a0212917c914d3d30ef5 100644 (file)
 
 #include <string.h>
 
+#ifdef PV_USE_PCLMUL_DETECT
+#include <cpuid.h>
+#endif
+
 typedef pv_u128_ u128;
 
 /* ========
@@ -73,7 +77,7 @@ static inline void pv_xor_y(polyval_t *, u128 v);
  *
  * (This is a carryless multiply in the Polyval galois field)
  */
-static void pv_mul_y_h(polyval_t *);
+static void pv_mul_y_h(polyval_t *);
 #endif
 
 /* =====
@@ -118,54 +122,73 @@ bswap32(uint64_t v)
 #define convert_byte_order32(x) (x)
 #endif
 
-#ifdef PV_USE_PCLMUL
+#if defined PV_USE_PCLMUL_UNCONDITIONAL
+#define PCLMUL_MEMBER(v) (v)
+#define PV_USE_PCLMUL
+
+#elif defined PV_USE_PCLMUL_DETECT
+#define PCLMUL_MEMBER(v) (v).u128x1
+#define CTMUL64_MEMBER(v) (v).u64x2
+#define PV_USE_PCLMUL
+#define PV_USE_CTMUL64
 
+#elif defined PV_USE_CTMUL64
+#define CTMUL64_MEMBER(v) (v)
+#endif
+
+#ifdef PV_USE_PCLMUL
 #include "ext/polyval/pclmul.c"
 
 static inline u128
 u128_from_bytes_pclmul(const uint8_t *bytes)
 {
-  return _mm_loadu_si128((const u128*)bytes);
+  u128 r;
+  PCLMUL_MEMBER(r) = _mm_loadu_si128((const __m128i*)bytes);
+  return r;
 }
 static inline void
 u128_to_bytes_pclmul(u128 val, uint8_t *bytes_out)
 {
-  _mm_storeu_si128((u128*)bytes_out, val);
+  _mm_storeu_si128((__m128i*)bytes_out, PCLMUL_MEMBER(val));
 }
 static inline void
 pv_xor_y_pclmul(polyval_t *pv, u128 v)
 {
-  pv->y = _mm_xor_si128(pv->y, v);
+  PCLMUL_MEMBER(pv->y) = _mm_xor_si128(PCLMUL_MEMBER(pv->y),
+                                       PCLMUL_MEMBER(v));
 }
-#elif defined(PV_USE_CTMUL64)
+#endif
 
+#if defined(PV_USE_CTMUL64)
 #include "ext/polyval/ctmul64.c"
 
 static inline u128
 u128_from_bytes_ctmul64(const uint8_t *bytes)
 {
   u128 r;
-  memcpy(&r.lo, bytes, 8);
-  memcpy(&r.hi, bytes + 8, 8);
-  r.lo = convert_byte_order64(r.lo);
-  r.hi = convert_byte_order64(r.hi);
+  memcpy(&CTMUL64_MEMBER(r).lo, bytes, 8);
+  memcpy(&CTMUL64_MEMBER(r).hi, bytes + 8, 8);
+  CTMUL64_MEMBER(r).lo = convert_byte_order64(CTMUL64_MEMBER(r).lo);
+  CTMUL64_MEMBER(r).hi = convert_byte_order64(CTMUL64_MEMBER(r).hi);
   return r;
 }
 static inline void
 u128_to_bytes_ctmul64(u128 val, uint8_t *bytes_out)
 {
-  uint64_t lo = convert_byte_order64(val.lo);
-  uint64_t hi = convert_byte_order64(val.hi);
+  uint64_t lo = convert_byte_order64(CTMUL64_MEMBER(val).lo);
+  uint64_t hi = convert_byte_order64(CTMUL64_MEMBER(val).hi);
   memcpy(bytes_out, &lo, 8);
   memcpy(bytes_out + 8, &hi, 8);
 }
 static inline void
 pv_xor_y_ctmul64(polyval_t *pv, u128 val)
 {
-  pv->y.lo ^= val.lo;
-  pv->y.hi ^= val.hi;
+  CTMUL64_MEMBER(pv->y).lo ^= CTMUL64_MEMBER(val).lo;
+  CTMUL64_MEMBER(pv->y).hi ^= CTMUL64_MEMBER(val).hi;
 }
-#elif defined(PV_USE_CTMUL)
+#endif
+
+#if defined(PV_USE_CTMUL)
 #include "ext/polyval/ctmul.c"
 
 static inline u128
@@ -252,7 +275,85 @@ pv_xor_y_ctmul(polyval_t *pv, u128 val)
     memset(&pv->y, 0, sizeof(u128));                                    \
   }
 
-#ifdef PV_USE_PCLMUL
+#ifdef PV_USE_PCLMUL_DETECT
+/* We use a boolean to distinguish whether to use the PCLMUL instructions,
+ * but instead we could use function pointers.  It's probably worth
+ * benchmarking, though it's unlikely to make a measurable difference.
+ */
+static bool use_pclmul = false;
+
+/* Declare _both_ variations of our code, statically,
+ * with different prefixes. */
+PV_DECLARE(pclmul_, static,
+           u128_from_bytes_pclmul,
+           u128_to_bytes_pclmul,
+           pv_xor_y_pclmul,
+           pv_mul_y_h_pclmul)
+
+PV_DECLARE(ctmul64_, static,
+           u128_from_bytes_ctmul64,
+           u128_to_bytes_ctmul64,
+           pv_xor_y_ctmul64,
+           pv_mul_y_h_ctmul64)
+
+void
+polyval_key_init(polyval_key_t *pv, const uint8_t *key)
+{
+  if (use_pclmul)
+    pclmul_polyval_key_init(pv, key);
+  else
+    ctmul64_polyval_key_init(pv, key);
+}
+void
+polyval_init(polyval_t *pv, const uint8_t *key)
+{
+  if (use_pclmul)
+    pclmul_polyval_init(pv, key);
+  else
+    ctmul64_polyval_init(pv, key);
+}
+void
+polyval_init_from_key(polyval_t *pv, const polyval_key_t *key)
+{
+  if (use_pclmul)
+    pclmul_polyval_init_from_key(pv, key);
+  else
+    ctmul64_polyval_init_from_key(pv, key);
+}
+void
+polyval_add_block(polyval_t *pv, const uint8_t *block)
+{
+  if (use_pclmul)
+    pclmul_polyval_add_block(pv, block);
+  else
+    ctmul64_polyval_add_block(pv, block);
+}
+void
+polyval_add_zpad(polyval_t *pv, const uint8_t *data, size_t n)
+{
+  if (use_pclmul)
+    pclmul_polyval_add_zpad(pv, data, n);
+  else
+    ctmul64_polyval_add_zpad(pv, data, n);
+}
+void
+polyval_get_tag(const polyval_t *pv, uint8_t *tag_out)
+{
+  if (use_pclmul)
+    pclmul_polyval_get_tag(pv, tag_out);
+  else
+    ctmul64_polyval_get_tag(pv, tag_out);
+}
+void
+polyval_reset(polyval_t *pv)
+{
+  if (use_pclmul)
+    pclmul_polyval_reset(pv);
+  else
+    ctmul64_polyval_reset(pv);
+}
+
+#elif defined(PV_USE_PCLMUL)
 PV_DECLARE(, ,
            u128_from_bytes_pclmul,
            u128_to_bytes_pclmul,
@@ -271,7 +372,25 @@ PV_DECLARE(, , u128_from_bytes_ctmul,
            u128_to_bytes_ctmul,
            pv_xor_y_ctmul,
            pv_mul_y_h_ctmul)
+#endif
 
+#ifdef PV_USE_PCLMUL_DETECT
+void
+polyval_detect_implementation(void)
+{
+  /* CPUID leaf 1 reports PCLMULQDQ support in bit 1 of ECX; if the
+   * leaf cannot be queried we keep using the ctmul64 code path. */
+  unsigned int eax, ebx, ecx, edx;
+  use_pclmul = false;
+  if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
+    use_pclmul = (0 != (ecx & (1u << 1)));
+  }
+}
+#else
+void
+polyval_detect_implementation(void)
+{
+}
 #endif
 
 #if 0
index a0e71e81bd328c38663a48279462b1444aca888a..a7cd8b38694e066c20d49bc7f25be8b1d1072e86 100644 (file)
 #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) \
   || defined(_M_X64) || defined(_M_IX86) || defined(__i486)       \
   || defined(__i386__)
-/* Use intel intrinsics for carryless multiply.
- *
- * TODO: In theory we should detect whether we have the relevant instructions,
- * but they are all at least 15 years old.
+#define PV_INTEL_ARCH
+#endif
+
+#if defined(PV_INTEL_ARCH) && defined(__PCLMUL__)
+/* We're building for an architecture that always has the intel
+ * intrinsics for carryless multiply.
+ * No need for runtime detection.
+ */
+#define PV_USE_PCLMUL_UNCONDITIONAL
+
+#elif defined(PV_INTEL_ARCH) && SIZEOF_VOID_P >= 8
+/* We _might_ have PCLMUL, or we might not.
+ * We need to detect it at runtime.
  */
-#define PV_USE_PCLMUL
+#define PV_USE_PCLMUL_DETECT
+
 #elif SIZEOF_VOID_P >= 8
 /* It's a 64-bit architecture; use the generic 64-bit constant-time
  * implementation.
  * Declare a 128 bit integer type.
 * The exact representation will depend on which implementation we've chosen.
  */
-#ifdef PV_USE_PCLMUL
+#if defined(PV_USE_PCLMUL_UNCONDITIONAL)
 #include <emmintrin.h>
 typedef __m128i pv_u128_;
+#elif defined(PV_USE_PCLMUL_DETECT)
+#include <emmintrin.h>
+typedef union pv_u128_ {
+  __m128i u128x1;
+  struct {
+    uint64_t lo;
+    uint64_t hi;
+  } u64x2;
+} pv_u128_;
 #elif defined(PV_USE_CTMUL64)
 typedef struct pv_u128_ {
   uint64_t lo;
@@ -117,4 +136,7 @@ void polyval_get_tag(const polyval_t *, uint8_t *tag_out);
  */
 void polyval_reset(polyval_t *);
 
+/** If a faster-than-default polyval implementation is available, use it. */
+void polyval_detect_implementation(void);
+
 #endif
index ef9908c893fbb2076b27c9a62967fb308a900bf3..f846ca0fef286df48954d9e6a70ceba0c62d47d2 100644 (file)
@@ -26,6 +26,7 @@
 #include "lib/crypt_ops/crypto_options_st.h"
 #include "lib/conf/conftypes.h"
 #include "lib/log/util_bug.h"
+#include "ext/polyval/polyval.h"
 
 #include "lib/subsys/subsys.h"
 
@@ -69,6 +70,8 @@ crypto_early_init(void)
     crypto_nss_early_init(0);
 #endif
 
+    polyval_detect_implementation();
+
     if (crypto_seed_rng() < 0)
       return -1;
     if (crypto_init_siphash_key() < 0)