Adapt pclmul.c to work with polyval.c

author Nick Mathewson <nickm@torproject.org>
Sat, 19 Apr 2025 01:09:25 +0000 (21:09 -0400)

committer Nick Mathewson <nickm@torproject.org>
Wed, 21 May 2025 13:43:51 +0000 (09:43 -0400)

diff --git a/src/ext/polyval/pclmul.c b/src/ext/polyval/pclmul.c

index a58e7dc02bebcfe855f384d937848d8189b25fc2..5a10c1e29a3c97c67ee9810b34b90d4e0eeaa47f 100644 (file)
--- a/src/ext/polyval/pclmul.c
+++ b/src/ext/polyval/pclmul.c
@@ -1,7 +1,7 @@
  /*
   * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
   *
- * Permission is hereby granted, free of charge, to any person obtaining 
+ * Permission is hereby granted, free of charge, to any person obtaining
   * a copy of this software and associated documentation files (the
   * "Software"), to deal in the Software without restriction, including
   * without limitation the rights to use, copy, modify, merge, publish,
@@ -9,12 +9,12 @@
   * permit persons to whom the Software is furnished to do so, subject to
   * the following conditions:
   *
- * The above copyright notice and this permission notice shall be 
+ * The above copyright notice and this permission notice shall be
   * included in all copies or substantial portions of the Software.
   *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
@@ -22,16 +22,24 @@
   * SOFTWARE.
   */
  
-#define BR_ENABLE_INTRINSICS   1
-#include "inner.h"
-
  /*
   * This is the GHASH implementation that leverages the pclmulqdq opcode
   * (from the AES-NI instructions).
   */
  
-#if BR_AES_X86NI
+#include <wmmintrin.h>
+
+#ifndef __GNUC__
+#define __attribute__(x)
+#endif
  
+#define BR_TARGET(x) __attribute__((target(x)))
+
+#if defined(__GNUC__) && !defined(__clang__)
+        _Pragma("GCC target(\"sse2,ssse3,sse4.1,aes,pclmul\")")
+#endif
+
+#if 0
  /*
   * Test CPU support for PCLMULQDQ.
   */
@@ -53,64 +61,7 @@ br_ghash_pclmul_get(void)
  }
  
  BR_TARGETS_X86_UP
-
-/*
- * GHASH is defined over elements of GF(2^128) with "full little-endian"
- * representation: leftmost byte is least significant, and, within each
- * byte, leftmost _bit_ is least significant. The natural ordering in
- * x86 is "mixed little-endian": bytes are ordered from least to most
- * significant, but bits within a byte are in most-to-least significant
- * order. Going to full little-endian representation would require
- * reversing bits within each byte, which is doable but expensive.
- *
- * Instead, we go to full big-endian representation, by swapping bytes
- * around, which is done with a single _mm_shuffle_epi8() opcode (it
- * comes with SSSE3; all CPU that offer pclmulqdq also have SSSE3). We
- * can use a full big-endian representation because in a carryless
- * multiplication, we have a nice bit reversal property:
- *
- *    rev_128(x) * rev_128(y) = rev_255(x * y)
- *
- * So by using full big-endian, we still get the right result, except
- * that it is right-shifted by 1 bit. The left-shift is relatively
- * inexpensive, and it can be mutualised.
- *
- *
- * Since SSE2 opcodes do not have facilities for shitfting full 128-bit
- * values with bit precision, we have to break down values into 64-bit
- * chunks. We number chunks from 0 to 3 in left to right order.
- */
-
-/*
- * Byte-swap a complete 128-bit value. This normally uses
- * _mm_shuffle_epi8(), which gets translated to pshufb (an SSSE3 opcode).
- * However, this crashes old Clang versions, so, for Clang before 3.8,
- * we use an alternate (and less efficient) version.
- */
-#if BR_CLANG && !BR_CLANG_3_8
-#define BYTESWAP_DECL
-#define BYTESWAP_PREP   (void)0
-#define BYTESWAP(x)   do { \
-               __m128i byteswap1, byteswap2; \
-               byteswap1 = (x); \
-               byteswap2 = _mm_srli_epi16(byteswap1, 8); \
-               byteswap1 = _mm_slli_epi16(byteswap1, 8); \
-               byteswap1 = _mm_or_si128(byteswap1, byteswap2); \
-               byteswap1 = _mm_shufflelo_epi16(byteswap1, 0x1B); \
-               byteswap1 = _mm_shufflehi_epi16(byteswap1, 0x1B); \
-               (x) = _mm_shuffle_epi32(byteswap1, 0x4E); \
-       } while (0)
-#else
-#define BYTESWAP_DECL   __m128i byteswap_index;
-#define BYTESWAP_PREP   do { \
-               byteswap_index = _mm_set_epi8( \
-                       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
-       } while (0)
-#define BYTESWAP(x)   do { \
-               (x) = _mm_shuffle_epi8((x), byteswap_index); \
-       } while (0)
  #endif
-
  /*
   * Call pclmulqdq. Clang appears to have trouble with the intrinsic, so,
   * for that compiler, we use inline assembly. Inline assembly is
@@ -121,8 +72,8 @@ BR_TARGETS_X86_UP
   * We use a target of "sse2" only, so that Clang may still handle the
   * '__m128i' type and allocate SSE2 registers.
   */
-#if BR_CLANG
-BR_TARGET("sse2")
+#ifdef __clang__
+ BR_TARGET("sse2")
  static inline __m128i
  pclmulqdq00(__m128i x, __m128i y)
  {
@@ -158,22 +109,6 @@ pclmulqdq11(__m128i x, __m128i y)
                 kx = _mm_xor_si128(k0, k1); \
         } while (0)
  
-/*
- * Left-shift by 1 bit a 256-bit value (in four 64-bit words).
- */
-#define SL_256(x0, x1, x2, x3)   do { \
-               x0 = _mm_or_si128( \
-                       _mm_slli_epi64(x0, 1), \
-                       _mm_srli_epi64(x1, 63)); \
-               x1 = _mm_or_si128( \
-                       _mm_slli_epi64(x1, 1), \
-                       _mm_srli_epi64(x2, 63)); \
-               x2 = _mm_or_si128( \
-                       _mm_slli_epi64(x2, 1), \
-                       _mm_srli_epi64(x3, 63)); \
-               x3 = _mm_slli_epi64(x3, 1); \
-       } while (0)
-
  /*
   * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
   * result is written in x0..x1.
@@ -213,150 +148,22 @@ pclmulqdq11(__m128i x, __m128i y)
                                 _mm_slli_epi64(x2, 57))); \
         } while (0)
  
-/*
- * Square value kw into (dw,dx).
- */
-#define SQUARE_F128(kw, dw, dx)   do { \
-               __m128i z0, z1, z2, z3; \
-               z1 = pclmulqdq11(kw, kw); \
-               z3 = pclmulqdq00(kw, kw); \
-               z0 = _mm_shuffle_epi32(z1, 0x0E); \
-               z2 = _mm_shuffle_epi32(z3, 0x0E); \
-               SL_256(z0, z1, z2, z3); \
-               REDUCE_F128(z0, z1, z2, z3); \
-               PBK(z0, z1, dw, dx); \
-       } while (0)
  
  /* see bearssl_hash.h */
  BR_TARGET("ssse3,pclmul")
-void
-br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
+static
+void pv_mul_y_h(polyval_t *pv)
  {
-       const unsigned char *buf1, *buf2;
-       unsigned char tmp[64];
-       size_t num4, num1;
         __m128i yw, h1w, h1x;
-       BYTESWAP_DECL
  
-       /*
-        * We split data into two chunks. First chunk starts at buf1
-        * and contains num4 blocks of 64-byte values. Second chunk
-        * starts at buf2 and contains num1 blocks of 16-byte values.
-        * We want the first chunk to be as large as possible.
-        */
-       buf1 = data;
-       num4 = len >> 6;
-       len &= 63;
-       buf2 = buf1 + (num4 << 6);
-       num1 = (len + 15) >> 4;
-       if ((len & 15) != 0) {
-               memcpy(tmp, buf2, len);
-               memset(tmp + len, 0, (num1 << 4) - len);
-               buf2 = tmp;
-       }
+        h1w = pv->h;
+        BK(h1w, h1x);
  
-       /*
-        * Preparatory step for endian conversions.
-        */
-       BYTESWAP_PREP;
-
-       /*
-        * Load y and h.
-        */
-       yw = _mm_loadu_si128(y);
-       h1w = _mm_loadu_si128(h);
-       BYTESWAP(yw);
-       BYTESWAP(h1w);
-       BK(h1w, h1x);
-
-       if (num4 > 0) {
-               __m128i h2w, h2x, h3w, h3x, h4w, h4x;
-               __m128i t0, t1, t2, t3;
-
-               /*
-                * Compute h2 = h^2.
-                */
-               SQUARE_F128(h1w, h2w, h2x);
-
-               /*
-                * Compute h3 = h^3 = h*(h^2).
-                */
-               t1 = pclmulqdq11(h1w, h2w);
-               t3 = pclmulqdq00(h1w, h2w);
-               t2 = _mm_xor_si128(pclmulqdq00(h1x, h2x),
-                       _mm_xor_si128(t1, t3));
-               t0 = _mm_shuffle_epi32(t1, 0x0E);
-               t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
-               t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
-               SL_256(t0, t1, t2, t3);
-               REDUCE_F128(t0, t1, t2, t3);
-               PBK(t0, t1, h3w, h3x);
-
-               /*
-                * Compute h4 = h^4 = (h^2)^2.
-                */
-               SQUARE_F128(h2w, h4w, h4x);
-
-               while (num4 -- > 0) {
-                       __m128i aw0, aw1, aw2, aw3;
-                       __m128i ax0, ax1, ax2, ax3;
-
-                       aw0 = _mm_loadu_si128((void *)(buf1 +  0));
-                       aw1 = _mm_loadu_si128((void *)(buf1 + 16));
-                       aw2 = _mm_loadu_si128((void *)(buf1 + 32));
-                       aw3 = _mm_loadu_si128((void *)(buf1 + 48));
-                       BYTESWAP(aw0);
-                       BYTESWAP(aw1);
-                       BYTESWAP(aw2);
-                       BYTESWAP(aw3);
-                       buf1 += 64;
-
-                       aw0 = _mm_xor_si128(aw0, yw);
-                       BK(aw1, ax1);
-                       BK(aw2, ax2);
-                       BK(aw3, ax3);
-                       BK(aw0, ax0);
-
-                       t1 = _mm_xor_si128(
-                               _mm_xor_si128(
-                                       pclmulqdq11(aw0, h4w),
-                                       pclmulqdq11(aw1, h3w)),
-                               _mm_xor_si128(
-                                       pclmulqdq11(aw2, h2w),
-                                       pclmulqdq11(aw3, h1w)));
-                       t3 = _mm_xor_si128(
-                               _mm_xor_si128(
-                                       pclmulqdq00(aw0, h4w),
-                                       pclmulqdq00(aw1, h3w)),
-                               _mm_xor_si128(
-                                       pclmulqdq00(aw2, h2w),
-                                       pclmulqdq00(aw3, h1w)));
-                       t2 = _mm_xor_si128(
-                               _mm_xor_si128(
-                                       pclmulqdq00(ax0, h4x),
-                                       pclmulqdq00(ax1, h3x)),
-                               _mm_xor_si128(
-                                       pclmulqdq00(ax2, h2x),
-                                       pclmulqdq00(ax3, h1x)));
-                       t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
-                       t0 = _mm_shuffle_epi32(t1, 0x0E);
-                       t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
-                       t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
-                       SL_256(t0, t1, t2, t3);
-                       REDUCE_F128(t0, t1, t2, t3);
-                       yw = _mm_unpacklo_epi64(t1, t0);
-               }
-       }
-
-       while (num1 -- > 0) {
+        {
                 __m128i aw, ax;
                 __m128i t0, t1, t2, t3;
  
-               aw = _mm_loadu_si128((void *)buf2);
-               BYTESWAP(aw);
-               buf2 += 16;
-
-               aw = _mm_xor_si128(aw, yw);
+                aw = pv->y;
                 BK(aw, ax);
  
                 t1 = pclmulqdq11(aw, h1w);
@@ -366,24 +173,12 @@ br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
                 t0 = _mm_shuffle_epi32(t1, 0x0E);
                 t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
                 t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
+#if 0 // This step is GHASH-only.
                 SL_256(t0, t1, t2, t3);
+#endif
                 REDUCE_F128(t0, t1, t2, t3);
                 yw = _mm_unpacklo_epi64(t1, t0);
         }
  
-       BYTESWAP(yw);
-       _mm_storeu_si128(y, yw);
-}
-
-BR_TARGETS_X86_DOWN
-
-#else
-
-/* see bearssl_hash.h */
-br_ghash
-br_ghash_pclmul_get(void)
-{
-       return 0;
+       pv->y = yw;
  }
-
-#endif
diff --git a/src/ext/polyval/polyval.c b/src/ext/polyval/polyval.c

index 9140b9457e78e4aff6aa4d394af6fdeba895047e..7c6df7ab082888f3d5020801e849ee60cf5e32a2 100644 (file)
--- a/src/ext/polyval/polyval.c
+++ b/src/ext/polyval/polyval.c
@@ -12,10 +12,10 @@
  
  #include <string.h>
  
-typedef struct pv_u128_ u128;
+typedef pv_u128_ u128;
  
  static inline u128 u128_from_bytes(const uint8_t *bytes);
-static inline void u128_to_bytes(u128 u128, uint8_t *bytes_out);
+static inline void u128_to_bytes(u128, uint8_t *bytes_out);
  static inline void pv_xor(polyval_t *, u128);
  static inline void pv_init_extra(polyval_t *pv);
  
@@ -23,8 +23,7 @@ static inline void pv_init_extra(polyval_t *pv);
  /**
   * Within the polyval struct, perform "y *= h".
   */
-static
-void pv_mul_y_h(polyval_t *);
+static void pv_mul_y_h(polyval_t *);
  
  #ifdef WORDS_BIG_ENDIAN
  #ifdef __GNUC__
@@ -64,7 +63,32 @@ bswap32(uint64_t v)
  #define convert_byte_order32(x) (x)
  #endif
  
-#ifdef PV_USE_CTMUL64
+
+#ifdef PV_USE_PCLMUL
+
+#include "ext/polyval/pclmul.c"
+
+static inline u128
+u128_from_bytes(const uint8_t *bytes)
+{
+  return _mm_loadu_si128((const u128*)bytes);
+}
+static inline void
+u128_to_bytes(u128 val, uint8_t *bytes_out)
+{
+  _mm_storeu_si128((u128*)bytes_out, val);
+}
+static inline void
+pv_xor(polyval_t *pv, u128 v)
+{
+  pv->y = _mm_xor_si128(pv->y, v);
+}
+static inline void
+pv_init_extra(polyval_t *pv)
+{
+  (void)pv;
+}
+#elif defined(PV_USE_CTMUL64)
  
  #include "ext/polyval/ctmul64.c"
  
diff --git a/src/ext/polyval/polyval.h b/src/ext/polyval/polyval.h

index 4e21fcbb85af4ef7acf08df9229da3f6e069e803..8ad48d877068a51aa73bf42ac102ae1fbfeb711f 100644 (file)
--- a/src/ext/polyval/polyval.h
+++ b/src/ext/polyval/polyval.h
@@ -12,18 +12,21 @@
  #include "orconfig.h"
  #include "lib/cc/torint.h"
  
-#define PV_USE_CTMUL
+#define PV_USE_PCLMUL
  
-#ifdef PV_USE_CTMUL64
+#ifdef PV_USE_PCLMUL
+#include <emmintrin.h>
+typedef __m128i pv_u128_;
+#elif defined(PV_USE_CTMUL64)
  /** A 128-bit integer represented as its low and high portion. */
-struct pv_u128_ {
+typedef struct pv_u128_ {
    uint64_t lo;
    uint64_t hi;
  } pv_u128_;
  #elif defined(PV_USE_CTMUL)
-struct pv_u128_ {
+typedef struct pv_u128_ {
    uint32_t v[4];
-};
+} pv_u128_;
  #endif
  
  /**
@@ -31,14 +34,14 @@ struct pv_u128_ {
   **/
  typedef struct polyval_t {
    /** The key itself. */
-  struct pv_u128_ h;
+  pv_u128_ h;
  #ifdef PV_USE_CTMUL64
    /** The elements of the key in bit-reversed form.
     * (Used as an optimization.) */
-  struct pv_u128_ hr;
+  pv_u128_ hr;
  #endif
    /** The accumulator */
-  struct pv_u128_ y;
+  pv_u128_ y;
  } polyval_t;
  
  #define POLYVAL_KEY_LEN 16
author	Nick Mathewson <nickm@torproject.org>
	Sat, 19 Apr 2025 01:09:25 +0000 (21:09 -0400)
committer	Nick Mathewson <nickm@torproject.org>
	Wed, 21 May 2025 13:43:51 +0000 (09:43 -0400)
src/ext/polyval/pclmul.c		patch \| blob \| blame \| history
src/ext/polyval/polyval.c		patch \| blob \| blame \| history
src/ext/polyval/polyval.h		patch \| blob \| blame \| history