From cbeb8ac87a848bc2aa5df1dbade7728c327bc5fa Mon Sep 17 00:00:00 2001
From: Nick Mathewson <nickm@torproject.org>
Date: Fri, 18 Apr 2025 21:09:25 -0400
Subject: [PATCH] Adapt pclmul.c to work with polyval.c

---
 src/ext/polyval/pclmul.c  | 259 ++++----------------------------------
 src/ext/polyval/polyval.c |  34 ++++-
 src/ext/polyval/polyval.h |  19 +--
 3 files changed, 67 insertions(+), 245 deletions(-)

diff --git a/src/ext/polyval/pclmul.c b/src/ext/polyval/pclmul.c
index a58e7dc02b..5a10c1e29a 100644
--- a/src/ext/polyval/pclmul.c
+++ b/src/ext/polyval/pclmul.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2017 Thomas Pornin
  *
- * Permission is hereby granted, free of charge, to any person obtaining 
+ * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
  * without limitation the rights to use, copy, modify, merge, publish,
@@ -9,12 +9,12 @@
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
  *
- * The above copyright notice and this permission notice shall be 
+ * The above copyright notice and this permission notice shall be
  * included in all copies or substantial portions of the Software.
  *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
@@ -22,16 +22,24 @@
  * SOFTWARE.
  */

-#define BR_ENABLE_INTRINSICS   1
-#include "inner.h"
-
 /*
  * This is the GHASH implementation that leverages the pclmulqdq opcode
  * (from the AES-NI instructions).
  */

-#if BR_AES_X86NI
+#include <immintrin.h>
+
+#ifndef __GNUC__
+#define __attribute__(x)
+#endif
+#define BR_TARGET(x) __attribute__((target(x)))
+
+#if defined(__GNUC__) && !defined(__clang__)
+ _Pragma("GCC target(\"sse2,ssse3,sse4.1,aes,pclmul\")")
+#endif
+
+#if 0
 /*
  * Test CPU support for PCLMULQDQ.
  */
@@ -53,64 +61,7 @@ br_ghash_pclmul_get(void)
 }

 BR_TARGETS_X86_UP
-
-/*
- * GHASH is defined over elements of GF(2^128) with "full little-endian"
- * representation: leftmost byte is least significant, and, within each
- * byte, leftmost _bit_ is least significant. The natural ordering in
- * x86 is "mixed little-endian": bytes are ordered from least to most
- * significant, but bits within a byte are in most-to-least significant
- * order. Going to full little-endian representation would require
- * reversing bits within each byte, which is doable but expensive.
- *
- * Instead, we go to full big-endian representation, by swapping bytes
- * around, which is done with a single _mm_shuffle_epi8() opcode (it
- * comes with SSSE3; all CPU that offer pclmulqdq also have SSSE3). We
- * can use a full big-endian representation because in a carryless
- * multiplication, we have a nice bit reversal property:
- *
- *    rev_128(x) * rev_128(y) = rev_255(x * y)
- *
- * So by using full big-endian, we still get the right result, except
- * that it is right-shifted by 1 bit. The left-shift is relatively
- * inexpensive, and it can be mutualised.
- *
- *
- * Since SSE2 opcodes do not have facilities for shitfting full 128-bit
- * values with bit precision, we have to break down values into 64-bit
- * chunks. We number chunks from 0 to 3 in left to right order.
- */
-
-/*
- * Byte-swap a complete 128-bit value. This normally uses
- * _mm_shuffle_epi8(), which gets translated to pshufb (an SSSE3 opcode).
- * However, this crashes old Clang versions, so, for Clang before 3.8,
- * we use an alternate (and less efficient) version.
- */
-#if BR_CLANG && !BR_CLANG_3_8
-#define BYTESWAP_DECL
-#define BYTESWAP_PREP   (void)0
-#define BYTESWAP(x)   do { \
-		__m128i byteswap1, byteswap2; \
-		byteswap1 = (x); \
-		byteswap2 = _mm_srli_epi16(byteswap1, 8); \
-		byteswap1 = _mm_slli_epi16(byteswap1, 8); \
-		byteswap1 = _mm_or_si128(byteswap1, byteswap2); \
-		byteswap1 = _mm_shufflelo_epi16(byteswap1, 0x1B); \
-		byteswap1 = _mm_shufflehi_epi16(byteswap1, 0x1B); \
-		(x) = _mm_shuffle_epi32(byteswap1, 0x4E); \
-	} while (0)
-#else
-#define BYTESWAP_DECL   __m128i byteswap_index;
-#define BYTESWAP_PREP   do { \
-		byteswap_index = _mm_set_epi8( \
-			0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
-	} while (0)
-#define BYTESWAP(x)   do { \
-		(x) = _mm_shuffle_epi8((x), byteswap_index); \
-	} while (0)
 #endif
-
 /*
  * Call pclmulqdq. Clang appears to have trouble with the intrinsic, so,
  * for that compiler, we use inline assembly. Inline assembly is
@@ -121,8 +72,8 @@ BR_TARGETS_X86_UP
  * We use a target of "sse2" only, so that Clang may still handle the
  * '__m128i' type and allocate SSE2 registers.
  */
-#if BR_CLANG
-BR_TARGET("sse2")
+#ifdef __clang__
+BR_TARGET("sse2")
 static inline __m128i
 pclmulqdq00(__m128i x, __m128i y)
 {
@@ -158,22 +109,6 @@ pclmulqdq11(__m128i x, __m128i y)
 		kx = _mm_xor_si128(k0, k1); \
 	} while (0)

-/*
- * Left-shift by 1 bit a 256-bit value (in four 64-bit words).
- */
-#define SL_256(x0, x1, x2, x3) do { \
-		x0 = _mm_or_si128( \
-			_mm_slli_epi64(x0, 1), \
-			_mm_srli_epi64(x1, 63)); \
-		x1 = _mm_or_si128( \
-			_mm_slli_epi64(x1, 1), \
-			_mm_srli_epi64(x2, 63)); \
-		x2 = _mm_or_si128( \
-			_mm_slli_epi64(x2, 1), \
-			_mm_srli_epi64(x3, 63)); \
-		x3 = _mm_slli_epi64(x3, 1); \
-	} while (0)
-
 /*
  * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
  * result is written in x0..x1.
  */
@@ -213,150 +148,22 @@ pclmulqdq11(__m128i x, __m128i y)
 			_mm_slli_epi64(x2, 57))); \
 	} while (0)

-/*
- * Square value kw into (dw,dx).
- */
-#define SQUARE_F128(kw, dw, dx) do { \
-		__m128i z0, z1, z2, z3; \
-		z1 = pclmulqdq11(kw, kw); \
-		z3 = pclmulqdq00(kw, kw); \
-		z0 = _mm_shuffle_epi32(z1, 0x0E); \
-		z2 = _mm_shuffle_epi32(z3, 0x0E); \
-		SL_256(z0, z1, z2, z3); \
-		REDUCE_F128(z0, z1, z2, z3); \
-		PBK(z0, z1, dw, dx); \
-	} while (0)

 /* see bearssl_hash.h */
 BR_TARGET("ssse3,pclmul")
-void
-br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
+static
+void pv_mul_y_h(polyval_t *pv)
 {
-	const unsigned char *buf1, *buf2;
-	unsigned char tmp[64];
-	size_t num4, num1;
 	__m128i yw, h1w, h1x;
-	BYTESWAP_DECL

-	/*
-	 * We split data into two chunks. First chunk starts at buf1
-	 * and contains num4 blocks of 64-byte values. Second chunk
-	 * starts at buf2 and contains num1 blocks of 16-byte values.
-	 * We want the first chunk to be as large as possible.
-	 */
-	buf1 = data;
-	num4 = len >> 6;
-	len &= 63;
-	buf2 = buf1 + (num4 << 6);
-	num1 = (len + 15) >> 4;
-	if ((len & 15) != 0) {
-		memcpy(tmp, buf2, len);
-		memset(tmp + len, 0, (num1 << 4) - len);
-		buf2 = tmp;
-	}
+	h1w = pv->h;
+	BK(h1w, h1x);

-	/*
-	 * Preparatory step for endian conversions.
-	 */
-	BYTESWAP_PREP;
-
-	/*
-	 * Load y and h.
-	 */
-	yw = _mm_loadu_si128(y);
-	h1w = _mm_loadu_si128(h);
-	BYTESWAP(yw);
-	BYTESWAP(h1w);
-	BK(h1w, h1x);
-
-	if (num4 > 0) {
-		__m128i h2w, h2x, h3w, h3x, h4w, h4x;
-		__m128i t0, t1, t2, t3;
-
-		/*
-		 * Compute h2 = h^2.
-		 */
-		SQUARE_F128(h1w, h2w, h2x);
-
-		/*
-		 * Compute h3 = h^3 = h*(h^2).
-		 */
-		t1 = pclmulqdq11(h1w, h2w);
-		t3 = pclmulqdq00(h1w, h2w);
-		t2 = _mm_xor_si128(pclmulqdq00(h1x, h2x),
-			_mm_xor_si128(t1, t3));
-		t0 = _mm_shuffle_epi32(t1, 0x0E);
-		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
-		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
-		SL_256(t0, t1, t2, t3);
-		REDUCE_F128(t0, t1, t2, t3);
-		PBK(t0, t1, h3w, h3x);
-
-		/*
-		 * Compute h4 = h^4 = (h^2)^2.
-		 */
-		SQUARE_F128(h2w, h4w, h4x);
-
-		while (num4 -- > 0) {
-			__m128i aw0, aw1, aw2, aw3;
-			__m128i ax0, ax1, ax2, ax3;
-
-			aw0 = _mm_loadu_si128((void *)(buf1 +  0));
-			aw1 = _mm_loadu_si128((void *)(buf1 + 16));
-			aw2 = _mm_loadu_si128((void *)(buf1 + 32));
-			aw3 = _mm_loadu_si128((void *)(buf1 + 48));
-			BYTESWAP(aw0);
-			BYTESWAP(aw1);
-			BYTESWAP(aw2);
-			BYTESWAP(aw3);
-			buf1 += 64;
-
-			aw0 = _mm_xor_si128(aw0, yw);
-			BK(aw1, ax1);
-			BK(aw2, ax2);
-			BK(aw3, ax3);
-			BK(aw0, ax0);
-
-			t1 = _mm_xor_si128(
-				_mm_xor_si128(
-					pclmulqdq11(aw0, h4w),
-					pclmulqdq11(aw1, h3w)),
-				_mm_xor_si128(
-					pclmulqdq11(aw2, h2w),
-					pclmulqdq11(aw3, h1w)));
-			t3 = _mm_xor_si128(
-				_mm_xor_si128(
-					pclmulqdq00(aw0, h4w),
-					pclmulqdq00(aw1, h3w)),
-				_mm_xor_si128(
-					pclmulqdq00(aw2, h2w),
-					pclmulqdq00(aw3, h1w)));
-			t2 = _mm_xor_si128(
-				_mm_xor_si128(
-					pclmulqdq00(ax0, h4x),
-					pclmulqdq00(ax1, h3x)),
-				_mm_xor_si128(
-					pclmulqdq00(ax2, h2x),
-					pclmulqdq00(ax3, h1x)));
-			t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
-			t0 = _mm_shuffle_epi32(t1, 0x0E);
-			t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
-			t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
-			SL_256(t0, t1, t2, t3);
-			REDUCE_F128(t0, t1, t2, t3);
-			yw = _mm_unpacklo_epi64(t1, t0);
-		}
-	}
-
-	while (num1 -- > 0) {
+	{
 		__m128i aw, ax;
 		__m128i t0, t1, t2, t3;

-		aw = _mm_loadu_si128((void *)buf2);
-		BYTESWAP(aw);
-		buf2 += 16;
-
-		aw = _mm_xor_si128(aw, yw);
+		aw = pv->y;
 		BK(aw, ax);

 		t1 = pclmulqdq11(aw, h1w);
@@ -366,24 +173,12 @@ br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
 		t0 = _mm_shuffle_epi32(t1, 0x0E);
 		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
 		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
+#if 0 // This step is GHASH-only.
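+// (In GHASH's byte-swapped representation, the carryless product of two
+// 128-bit values comes out right-shifted by one bit -- see the rev_128()
+// / rev_255() note in the comment removed above -- so GHASH needs this
+// left shift; POLYVAL's representation does not.)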
 		SL_256(t0, t1, t2, t3);
+#endif
 		REDUCE_F128(t0, t1, t2, t3);
 		yw = _mm_unpacklo_epi64(t1, t0);
 	}
-	BYTESWAP(yw);
-	_mm_storeu_si128(y, yw);
-}
-
-BR_TARGETS_X86_DOWN
-
-#else
-
-/* see bearssl_hash.h */
-br_ghash
-br_ghash_pclmul_get(void)
-{
-	return 0;
+	pv->y = yw;
 }
-
-#endif
diff --git a/src/ext/polyval/polyval.c b/src/ext/polyval/polyval.c
index 9140b9457e..7c6df7ab08 100644
--- a/src/ext/polyval/polyval.c
+++ b/src/ext/polyval/polyval.c
@@ -12,10 +12,10 @@

 #include <string.h>

-typedef struct pv_u128_ u128;
+typedef pv_u128_ u128;

 static inline u128 u128_from_bytes(const uint8_t *bytes);
-static inline void u128_to_bytes(u128 u128, uint8_t *bytes_out);
+static inline void u128_to_bytes(u128, uint8_t *bytes_out);
 static inline void pv_xor(polyval_t *, u128);
 static inline void pv_init_extra(polyval_t *pv);
@@ -23,8 +23,7 @@ static inline void pv_init_extra(polyval_t *pv);
 /**
  * Within the polyval struct, perform "y *= h".
  */
-static
-void pv_mul_y_h(polyval_t *);
+static void pv_mul_y_h(polyval_t *);

 #ifdef WORDS_BIG_ENDIAN
 #ifdef __GNUC__
@@ -64,7 +63,32 @@ bswap32(uint64_t v)
 #define convert_byte_order32(x) (x)
 #endif

-#ifdef PV_USE_CTMUL64
+
+#ifdef PV_USE_PCLMUL
+
+#include "ext/polyval/pclmul.c"
+
+static inline u128
+u128_from_bytes(const uint8_t *bytes)
+{
+  return _mm_loadu_si128((const u128*)bytes);
+}
+static inline void
+u128_to_bytes(u128 val, uint8_t *bytes_out)
+{
+  _mm_storeu_si128((u128*)bytes_out, val);
+}
+static inline void
+pv_xor(polyval_t *pv, u128 v)
+{
+  pv->y = _mm_xor_si128(pv->y, v);
+}
+static inline void
+pv_init_extra(polyval_t *pv)
+{
+  (void)pv;
+}
+#elif defined(PV_USE_CTMUL64)

 #include "ext/polyval/ctmul64.c"
diff --git a/src/ext/polyval/polyval.h b/src/ext/polyval/polyval.h
index 4e21fcbb85..8ad48d8770 100644
--- a/src/ext/polyval/polyval.h
+++ b/src/ext/polyval/polyval.h
@@ -12,18 +12,21 @@
 #include "orconfig.h"
 #include "lib/cc/torint.h"

-#define PV_USE_CTMUL
+#define PV_USE_PCLMUL

-#ifdef PV_USE_CTMUL64
+#ifdef PV_USE_PCLMUL
+#include <emmintrin.h>
+typedef __m128i pv_u128_;
+#elif defined(PV_USE_CTMUL64)
 /** A 128-bit integer represented as its low and high portion. */
-struct pv_u128_ {
+typedef struct pv_u128_ {
   uint64_t lo;
   uint64_t hi;
-};
+} pv_u128_;
 #elif defined(PV_USE_CTMUL)
-struct pv_u128_ {
+typedef struct pv_u128_ {
   uint32_t v[4];
-};
+} pv_u128_;
 #endif
@@ -31,14 +34,14 @@ struct pv_u128_ {
 **/
 typedef struct polyval_t {
   /** The key itself. */
-  struct pv_u128_ h;
+  pv_u128_ h;
 #ifdef PV_USE_CTMUL64
   /** The elements of the key in bit-reversed form.
    * (Used as an optimization.) */
-  struct pv_u128_ hr;
+  pv_u128_ hr;
 #endif
   /** The accumulator */
-  struct pv_u128_ y;
+  pv_u128_ y;
 } polyval_t;

 #define POLYVAL_KEY_LEN 16
-- 
2.47.2
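
For reference, a minimal sketch (not part of the commit) of how polyval.c's
generic per-block loop is presumably expected to drive the three backend
primitives this patch wires up. The helper name polyval_update_blocks and
its whole-blocks-only contract are illustrative assumptions; only
u128_from_bytes(), pv_xor(), and pv_mul_y_h() come from the patch itself.

/* Hypothetical caller in polyval.c, assuming whole 16-byte blocks. */
static void
polyval_update_blocks(polyval_t *pv, const uint8_t *data, size_t n_blocks)
{
  while (n_blocks--) {
    /* POLYVAL per-block step: y = (y ^ block) * h in GF(2^128). */
    pv_xor(pv, u128_from_bytes(data));  /* y ^= block */
    pv_mul_y_h(pv);                     /* y *= h */
    data += 16;
  }
}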