]> git.ipfire.org Git - thirdparty/tor.git/commitdiff
Speed up polyval through pipelining.
author Nick Mathewson <nickm@torproject.org>
Thu, 15 May 2025 14:00:21 +0000 (10:00 -0400)
committer Nick Mathewson <nickm@torproject.org>
Wed, 21 May 2025 17:00:03 +0000 (13:00 -0400)
This optimization helps because:
  - We're not blocking the computation of each block on the computation of the
    previous one, which leads to fewer pipeline stalls.
  - We're deferring reduction until the end of handling a bunch of blocks.

src/ext/polyval/pclmul.c
src/ext/polyval/polyval.c
src/test/test_crypto.c

index 47a786b6b862f0d0cf260038253e30e6a87957dd..a441a7c1c62409a7809daf5e18ab4900e74ff1e4 100644 (file)
@@ -148,11 +148,85 @@ pclmulqdq11(__m128i x, __m128i y)
                                _mm_slli_epi64(x2, 57))); \
        } while (0)
 
+#define PCLMUL_BLOCK_STRIDE 4
+struct expanded_key_pclmul {
+       // powers of h in reverse order.
+       // (in other words, contains
+       // h^PCLMUL_BLOCK_STRIDE .. h^2, h^1)
+       __m128i k[PCLMUL_BLOCK_STRIDE];
+};
+
+BR_TARGET("ssse3,pclmul")
+static inline void
+expand_key_pclmul(const polyval_t *pv, struct expanded_key_pclmul *out)
+{
+       __m128i h1w, h1x;
+       __m128i lastw, lastx;
+       __m128i t0, t1, t2, t3;
+
+       h1w = PCLMUL_MEMBER(pv->key.h);
+        BK(h1w, h1x);
+       out->k[PCLMUL_BLOCK_STRIDE-1] = lastw = h1w;
+
+       for (int i = PCLMUL_BLOCK_STRIDE - 2; i >= 0; --i) {
+               BK(lastw, lastx);
+
+               t1 = pclmulqdq11(lastw, h1w);
+               t3 = pclmulqdq00(lastw, h1w);
+               t2 = pclmulqdq00(lastx, h1x);
+               t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
+               t0 = _mm_shuffle_epi32(t1, 0x0E);
+               t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
+               t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
+               REDUCE_F128(t0, t1, t2, t3);
+               out->k[i] = lastw = _mm_unpacklo_epi64(t1, t0);
+       }
+}
+
+// Add PCLMUL_BLOCK_STRIDE * 16 bytes from input to pv->y, reducing only once.
+BR_TARGET("ssse3,pclmul")
+static inline void
+pv_add_multiple_pclmul(polyval_t *pv,
+                      const uint8_t *input,
+                      const struct expanded_key_pclmul *expanded)
+{
+       __m128i t0, t1, t2, t3;
+
+       t1 = _mm_setzero_si128();
+       t2 = _mm_setzero_si128();
+       t3 = _mm_setzero_si128();
+
+       for (int i = 0; i < PCLMUL_BLOCK_STRIDE; ++i, input += 16) {
+               __m128i aw = _mm_loadu_si128((void *)(input));
+               __m128i ax;
+               __m128i hx;
+               if (i == 0) {
+                       aw = _mm_xor_si128(aw, PCLMUL_MEMBER(pv->y));
+               }
+               BK(aw, ax);
+               BK(expanded->k[i], hx);
+               t1 = _mm_xor_si128(t1,
+                                  pclmulqdq11(aw, expanded->k[i]));
+               t3 = _mm_xor_si128(t3,
+                                  pclmulqdq00(aw, expanded->k[i]));
+               t2 = _mm_xor_si128(t2,
+                                  pclmulqdq00(ax, hx));
+       }
+
+       t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
+       t0 = _mm_shuffle_epi32(t1, 0x0E);
+       t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
+       t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
+
+       REDUCE_F128(t0, t1, t2, t3);
+       PCLMUL_MEMBER(pv->y) = _mm_unpacklo_epi64(t1, t0);
+}
+
 
 /* see bearssl_hash.h */
 BR_TARGET("ssse3,pclmul")
-static
-void pv_mul_y_h_pclmul(polyval_t *pv)
+static inline void
+pv_mul_y_h_pclmul(polyval_t *pv)
 {
        __m128i yw, h1w, h1x;
 
index c2b4d0c383f6e6845944a0212917c914d3d30ef5..23ba8da9401647f30aa9fd3152b5ab6fe7761cca 100644 (file)
@@ -219,12 +219,30 @@ pv_xor_y_ctmul(polyval_t *pv, u128 val)
 }
 #endif
 
+struct expanded_key_none {};
+static inline void add_multiple_none(polyval_t *pv,
+                                     const uint8_t *input,
+                                     const struct expanded_key_none *expanded)
+{
+  (void) pv;
+  (void) input;
+  (void) expanded;
+}
+static inline void expand_key_none(const polyval_t *inp,
+                                   struct expanded_key_none *out)
+{
+  (void) inp;
+  (void) out;
+}
+
 #define PV_DECLARE(prefix,                                              \
                    st,                                                  \
                    u128_from_bytes,                                     \
                    u128_to_bytes,                                       \
                    pv_xor_y,                                            \
-                   pv_mul_y_h)                                          \
+                   pv_mul_y_h,                                          \
+                   block_stride,                                        \
+                   expanded_key_tp, expand_fn, add_multiple_fn)         \
   st void                                                               \
   prefix ## polyval_key_init(polyval_key_t *pvk, const uint8_t *key)    \
   {                                                                     \
@@ -252,6 +270,15 @@ pv_xor_y_ctmul(polyval_t *pv, u128 val)
   st void                                                               \
   prefix ## polyval_add_zpad(polyval_t *pv, const uint8_t *data, size_t n) \
   {                                                                     \
+    if (n > block_stride * 16) {                                        \
+      expanded_key_tp expanded_key;                                     \
+      expand_fn(pv, &expanded_key);                                     \
+      while (n > block_stride * 16) {                                   \
+        add_multiple_fn(pv, data, &expanded_key);                       \
+        n -= block_stride*16;                                           \
+        data += block_stride * 16;                                      \
+      }                                                                 \
+    }                                                                   \
     while (n > 16) {                                                    \
       polyval_add_block(pv, data);                                      \
       data += 16;                                                       \
@@ -288,13 +315,21 @@ PV_DECLARE(pclmul_, static,
            u128_from_bytes_pclmul,
            u128_to_bytes_pclmul,
            pv_xor_y_pclmul,
-           pv_mul_y_h_pclmul)
+           pv_mul_y_h_pclmul,
+           PCLMUL_BLOCK_STRIDE,
+           struct expanded_key_pclmul,
+           expand_key_pclmul,
+           pv_add_multiple_pclmul)
 
 PV_DECLARE(ctmul64_, static,
            u128_from_bytes_ctmul64,
            u128_to_bytes_ctmul64,
            pv_xor_y_ctmul64,
-           pv_mul_y_h_ctmul64)
+           pv_mul_y_h_ctmul64,
+           0,
+           struct expanded_key_none,
+           expand_key_none,
+           add_multiple_none)
 
 void
 polyval_key_init(polyval_key_t *pv, const uint8_t *key)
@@ -358,20 +393,32 @@ PV_DECLARE(, ,
            u128_from_bytes_pclmul,
            u128_to_bytes_pclmul,
            pv_xor_y_pclmul,
-           pv_mul_y_h_pclmul)
+           pv_mul_y_h_pclmul,
+           PCLMUL_BLOCK_STRIDE,
+           struct expanded_key_pclmul,
+           expand_key_pclmul,
+           pv_add_multiple_pclmul)
 
 #elif defined(PV_USE_CTMUL64)
 PV_DECLARE(, ,
            u128_from_bytes_ctmul64,
            u128_to_bytes_ctmul64,
            pv_xor_y_ctmul64,
-           pv_mul_y_h_ctmul64)
+           pv_mul_y_h_ctmul64,
+           0,
+           struct expanded_key_none,
+           expand_key_none,
+           add_multiple_none)
 
 #elif defined(PV_USE_CTMUL)
 PV_DECLARE(, , u128_from_bytes_ctmul,
            u128_to_bytes_ctmul,
            pv_xor_y_ctmul,
-           pv_mul_y_h_ctmul)
+           pv_mul_y_h_ctmul,
+           0,
+           struct expanded_key_none,
+           expand_key_none,
+           add_multiple_none)
 #endif
 
 #ifdef PV_USE_PCLMUL_DETECT
index 5b8d7f6db5717383fb77b984c8328ad8c04a02d9..f895eec891106b784319bcdc6c254435a5e69df7 100644 (file)
@@ -3201,6 +3201,7 @@ test_crypto_polyval(void *arg)
   uint8_t output[16];
   uint8_t output2[16];
   char *mem_op_hex_tmp=NULL;
+  uint8_t *longer = NULL;
 
   // From RFC 8452
   const char *key_hex = "25629347589242761d31f826ba4b757b";
@@ -3236,8 +3237,24 @@ test_crypto_polyval(void *arg)
   polyval_get_tag(&pv, output2);
   tt_mem_op(output, OP_EQ, output2, 16);
 
+  // Try a long input both ways, and make sure the answer is the same.
+  longer = tor_malloc_zero(4096);
+  crypto_rand((char *)longer, 4090); // leave zeros at the end.
+  polyval_reset(&pv);
+  polyval_add_zpad(&pv, longer, 4090);
+  polyval_get_tag(&pv, output);
+
+  polyval_reset(&pv);
+  const uint8_t *cp;
+  for (cp = longer; cp < longer + 4096; cp += 16) {
+    polyval_add_block(&pv, cp);
+  }
+  polyval_get_tag(&pv, output2);
+  tt_mem_op(output, OP_EQ, output2, 16);
+
  done:
   tor_free(mem_op_hex_tmp);
+  tor_free(longer);
 }
 
 static void