We don't want to pre-expand keys without pclmul, since it doesn't help in that case.
We don't want to pre-expand keys unconditionally, since many of our polyval keys
are only used for 16-byte inputs.
(Yes, this makes a difference in practice!)
_mm_slli_epi64(x2, 57))); \
} while (0)
-#define PCLMUL_BLOCK_STRIDE 4
-struct expanded_key_pclmul {
- // powers of h in reverse order.
- // (in other words, contains
- // h^PCLMUL_BLOCK_STRIDE .. H^2, H^1
- __m128i k[PCLMUL_BLOCK_STRIDE];
-};
BR_TARGET("ssse3,pclmul")
static inline void
-expand_key_pclmul(const polyval_t *pv, struct expanded_key_pclmul *out)
+expand_key_pclmul(const polyval_t *pv, pv_expanded_key_t *out)
{
__m128i h1w, h1x;
__m128i lastw, lastx;
h1w = PCLMUL_MEMBER(pv->key.h);
BK(h1w, h1x);
- out->k[PCLMUL_BLOCK_STRIDE-1] = lastw = h1w;
+ lastw = h1w;
- for (int i = PCLMUL_BLOCK_STRIDE - 2; i >= 0; --i) {
+ for (int i = PV_BLOCK_STRIDE - 2; i >= 0; --i) {
BK(lastw, lastx);
t1 = pclmulqdq11(lastw, h1w);
static inline void
pv_add_multiple_pclmul(polyval_t *pv,
const uint8_t *input,
- const struct expanded_key_pclmul *expanded)
+ const pv_expanded_key_t *expanded)
{
__m128i t0, t1, t2, t3;
t2 = _mm_setzero_si128();
t3 = _mm_setzero_si128();
- for (int i = 0; i < PCLMUL_BLOCK_STRIDE; ++i, input += 16) {
+ for (int i = 0; i < PV_BLOCK_STRIDE; ++i, input += 16) {
__m128i aw = _mm_loadu_si128((void *)(input));
__m128i ax;
- __m128i hx;
+ __m128i hx, hw;
if (i == 0) {
aw = _mm_xor_si128(aw, PCLMUL_MEMBER(pv->y));
}
+ if (i == PV_BLOCK_STRIDE - 1) {
+ hw = PCLMUL_MEMBER(pv->key.h);
+ } else {
+ hw = expanded->k[i];
+ }
BK(aw, ax);
- BK(expanded->k[i], hx);
- t1 = _mm_xor_si128(t1,
- pclmulqdq11(aw, expanded->k[i]));
- t3 = _mm_xor_si128(t3,
- pclmulqdq00(aw, expanded->k[i]));
- t2 = _mm_xor_si128(t2,
- pclmulqdq00(ax, hx));
+ BK(hw, hx);
+ t1 = _mm_xor_si128(t1, pclmulqdq11(aw, hw));
+ t3 = _mm_xor_si128(t3, pclmulqdq00(aw, hw));
+ t2 = _mm_xor_si128(t2, pclmulqdq00(ax, hx));
}
t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
u128_to_bytes_pclmul,
pv_xor_y_pclmul,
pv_mul_y_h_pclmul,
- PCLMUL_BLOCK_STRIDE,
- struct expanded_key_pclmul,
+ PV_BLOCK_STRIDE,
+ pv_expanded_key_t,
expand_key_pclmul,
pv_add_multiple_pclmul)
u128_to_bytes_pclmul,
pv_xor_y_pclmul,
pv_mul_y_h_pclmul,
- PCLMUL_BLOCK_STRIDE,
- struct expanded_key_pclmul,
+ PV_BLOCK_STRIDE,
+ pv_expanded_key_t,
expand_key_pclmul,
pv_add_multiple_pclmul)
-
#elif defined(PV_USE_CTMUL64)
PV_DECLARE(, ,
u128_from_bytes_ctmul64,
}
#endif
+#ifdef POLYVAL_USE_EXPANDED_KEYS
+
+/* SHOULD_EXPAND() is nonzero when the polyvalx_* functions should build and
+ * use a pre-expanded key.  With runtime detection it follows `use_pclmul`
+ * (defined outside this hunk; presumably set once CPU support is known --
+ * TODO confirm); otherwise it is always 1. */
+#ifdef PV_USE_PCLMUL_DETECT
+#define SHOULD_EXPAND() (use_pclmul)
+#else
+#define SHOULD_EXPAND() (1)
+#endif
+
+/**
+ * Initialize `pvx` from the raw key bytes at `key`.
+ *
+ * Sets up the embedded polyval_t with polyval_init(); then, if
+ * SHOULD_EXPAND() is true, fills pvx->expanded via expand_key_pclmul().
+ * When SHOULD_EXPAND() is false, pvx->expanded is left unwritten.
+ *
+ * The key length is whatever polyval_init() expects (presumably 16 bytes,
+ * as in the benchmark -- confirm against polyval_init()).
+ */
+void
+polyvalx_init(polyvalx_t *pvx, const uint8_t *key)
+{
+ polyval_init(&pvx->pv, key);
+ if (SHOULD_EXPAND()) {
+ expand_key_pclmul(&pvx->pv, &pvx->expanded);
+ }
+}
+/**
+ * Initialize `pvx` from an already-prepared polyval_key_t.
+ *
+ * Same as polyvalx_init(), except that the embedded polyval_t is set up with
+ * polyval_init_from_key().  pvx->expanded is filled in only when
+ * SHOULD_EXPAND() is true.
+ */
+void
+polyvalx_init_from_key(polyvalx_t *pvx, const polyval_key_t *key)
+{
+ polyval_init_from_key(&pvx->pv, key);
+ if (SHOULD_EXPAND()) {
+ expand_key_pclmul(&pvx->pv, &pvx->expanded);
+ }
+}
+/**
+ * Add a single block to `pvx`.
+ *
+ * Forwards directly to polyval_add_block() on the embedded state; the
+ * expanded key is not consulted.  `block` is presumably 16 bytes long, as
+ * polyval_add_block() requires -- confirm against its declaration.
+ */
+void
+polyvalx_add_block(polyvalx_t *pvx, const uint8_t *block)
+{
+ polyval_add_block(&pvx->pv, block);
+}
+/**
+ * Add `n` bytes from `data` to the polyval state in `pvx`, zero-padding the
+ * final block if `n` is not a multiple of 16.
+ *
+ * When SHOULD_EXPAND() is true, input is consumed PV_BLOCK_STRIDE blocks at
+ * a time using the pre-expanded key.  Whatever remains is fed through
+ * polyval_add_block() one 16-byte block at a time, and a trailing partial
+ * block is copied into a zeroed buffer first.
+ */
+void
+polyvalx_add_zpad(polyvalx_t *pvx, const uint8_t *data, size_t n)
+{
+  const size_t stride_bytes = PV_BLOCK_STRIDE * 16;
+
+  if (SHOULD_EXPAND()) {
+    /* ">=" rather than ">": an input that is an exact multiple of the
+     * stride should go through the wide path in its entirety, instead of
+     * leaving its last full stride to the single-block loop below. */
+    while (n >= stride_bytes) {
+      pv_add_multiple_pclmul(&pvx->pv, data, &pvx->expanded);
+      data += stride_bytes;
+      n -= stride_bytes;
+    }
+  }
+  /* Whole blocks need no padding, so take them straight from the input. */
+  while (n >= 16) {
+    polyval_add_block(&pvx->pv, data);
+    data += 16;
+    n -= 16;
+  }
+  if (n) {
+    /* Trailing partial block: zero-pad it up to 16 bytes. */
+    uint8_t block[16];
+    memset(block, 0, sizeof(block));
+    memcpy(block, data, n);
+    polyval_add_block(&pvx->pv, block);
+  }
+}
+/**
+ * Write the tag for everything added to `pvx` so far into `tag_out`.
+ *
+ * Forwards to polyval_get_tag() on the embedded state.  `tag_out` is
+ * presumably a 16-byte buffer (the unit test compares 16 bytes) -- confirm
+ * against polyval_get_tag().
+ */
+void
+polyvalx_get_tag(const polyvalx_t *pvx, uint8_t *tag_out)
+{
+ polyval_get_tag(&pvx->pv, tag_out);
+}
+/**
+ * Reset `pvx` by calling polyval_reset() on the embedded state.
+ *
+ * pvx->expanded is not touched, so the object can be reused without
+ * calling polyvalx_init() again.
+ */
+void polyvalx_reset(polyvalx_t *pvx)
+{
+ polyval_reset(&pvx->pv);
+}
+#endif
+
#if 0
#include <stdio.h>
int
* No need for runtime detection.
*/
#define PV_USE_PCLMUL_UNCONDITIONAL
+#define PCLMUL_ANY
#elif defined(PV_INTEL_ARCH) && SIZEOF_VOID_P >= 8
/* We _might_ have PCLMUL, or we might not.
* We need to detect it at runtime.
*/
#define PV_USE_PCLMUL_DETECT
+#define PCLMUL_ANY
#elif SIZEOF_VOID_P >= 8
/* It's a 64-bit architecture; use the generic 64-bit constant-time
#error "sizeof(void*) is implausibly weird."
#endif
+#ifdef PCLMUL_ANY
+#include <emmintrin.h>
+
+#define POLYVAL_USE_EXPANDED_KEYS
+#endif
+
/**
* Declare a 128 bit integer type.
* The exact representation will depend on which implementation we've chosen.
*/
#if defined(PV_USE_PCLMUL_UNCONDITIONAL)
-#include <emmintrin.h>
typedef __m128i pv_u128_;
#elif defined(PV_USE_PCLMUL_DETECT)
-#include <emmintrin.h>
typedef union pv_u128_ {
__m128i u128x1;
struct {
/** If a faster-than-default polyval implementation is available, use it. */
void polyval_detect_implementation(void);
+#ifdef POLYVAL_USE_EXPANDED_KEYS
+/* These variations are as for polyval_\*, but they use pre-expanded keys.
+ * They're appropriate when you know a key is likely to get used more than once
+ * on a large input.
+ */
+
+/** How many blocks to handle at once with an expanded key */
+#define PV_BLOCK_STRIDE 8
+/** Precomputed powers of a polyval key h, used to process PV_BLOCK_STRIDE
+ * blocks per call. */
+typedef struct pv_expanded_key_t {
+ // Powers of h in descending order, stopping at h^2.
+ // (In other words, contains
+ // h^PV_BLOCK_STRIDE .. h^2; h^1 is not stored here, since it is
+ // already available as the key inside the polyval_t.)
+ __m128i k[PV_BLOCK_STRIDE-1];
+} pv_expanded_key_t;
+/** A polyval state bundled with the pre-expanded form of its key. */
+typedef struct polyvalx_t {
+ polyval_t pv; // ordinary polyval state; every polyvalx_* call operates on it
+ pv_expanded_key_t expanded; // written by polyvalx_init*() only when key expansion is in use
+} polyvalx_t;
+
+/** Initialize from raw key bytes, expanding the key when that is enabled. */
+void polyvalx_init(polyvalx_t *, const uint8_t *key);
+/** Initialize from a prepared polyval_key_t, expanding it when enabled. */
+void polyvalx_init_from_key(polyvalx_t *, const polyval_key_t *key);
+/** Add one block; equivalent to polyval_add_block() on the inner state. */
+void polyvalx_add_block(polyvalx_t *, const uint8_t *block);
+/** Add `n` bytes, zero-padding the last block; uses the expanded key for
+ * long inputs when available. */
+void polyvalx_add_zpad(polyvalx_t *, const uint8_t *data, size_t n);
+/** Write the current tag to `tag_out`. */
+void polyvalx_get_tag(const polyvalx_t *, uint8_t *tag_out);
+/** Reset the accumulated state, keeping the key material in place. */
+void polyvalx_reset(polyvalx_t *);
+
+#else
+/* Without expanded-key support, every polyvalx_* name is a plain alias for
+ * the corresponding polyval_* name, so callers can use polyvalx_*
+ * unconditionally.
+ * NOTE(review): polyvalx_key_init has no counterpart among the functions
+ * declared in the expanded-key branch above -- confirm whether this alias
+ * is needed or a declaration is missing. */
+#define polyvalx_t polyval_t
+#define polyvalx_key_init polyval_key_init
+#define polyvalx_init polyval_init
+#define polyvalx_init_from_key polyval_init_from_key
+#define polyvalx_add_block polyval_add_block
+#define polyvalx_add_zpad polyval_add_zpad
+#define polyvalx_get_tag polyval_get_tag
+#define polyvalx_reset polyval_reset
+#endif
+
#endif
bench_polyval(void)
{
polyval_t pv;
+ polyvalx_t pvx;
uint8_t key[16];
uint8_t input[512];
uint64_t start, end, cstart, cend;
printf("polyval (add 512): %.2f ns; %.2f cpb\n",
NANOCOUNT(start, end, iters),
cpb(cstart, cend, iters * 512));
+
+ polyvalx_init(&pvx, key);
+ start = perftime();
+ cstart = cycles();
+ for (int i = 0; i < iters; ++i) {
+ polyvalx_add_zpad(&pvx, input, 512);
+ }
+ cend = cycles();
+ end = perftime();
+ printf("polyval (add 512, pre-expanded key): %.2f ns; %.2f cpb\n",
+ NANOCOUNT(start, end, iters),
+ cpb(cstart, cend, iters * 512));
}
static void
polyval_get_tag(&pv, output2);
tt_mem_op(output, OP_EQ, output2, 16);
+ // Now the same with polyvalx.
+ polyvalx_t pvx;
+ polyvalx_init(&pvx, key);
+ polyvalx_add_zpad(&pvx, longer, 4090);
+ polyvalx_get_tag(&pvx, output2);
+ tt_mem_op(output, OP_EQ, output2, 16);
+
+ polyvalx_reset(&pvx);
+ for (cp = longer; cp < longer + 4096; cp += 16) {
+ polyvalx_add_block(&pvx, cp);
+ }
+ polyvalx_get_tag(&pvx, output2);
+ tt_mem_op(output, OP_EQ, output2, 16);
+
done:
tor_free(mem_op_hex_tmp);
tor_free(longer);