From caf3acbebf481a53a6ea00e98fde81d9a0c1819a Mon Sep 17 00:00:00 2001
From: Remi Gacogne
Date: Sun, 1 Feb 2026 14:17:25 +0100
Subject: [PATCH] ext/ipcrypt2: Add missing softaes/untrinsics.h header

Signed-off-by: Remi Gacogne
---
 ext/ipcrypt2/softaes/untrinsics.h                   | 711 ++++++++++++++++++
 .../ext/ipcrypt2/softaes/untrinsics.h               |   1 +
 2 files changed, 712 insertions(+)
 create mode 100644 ext/ipcrypt2/softaes/untrinsics.h
 create mode 120000 pdns/dnsdistdist/ext/ipcrypt2/softaes/untrinsics.h

diff --git a/ext/ipcrypt2/softaes/untrinsics.h b/ext/ipcrypt2/softaes/untrinsics.h
new file mode 100644
index 0000000000..da132bbb28
--- /dev/null
+++ b/ext/ipcrypt2/softaes/untrinsics.h
@@ -0,0 +1,711 @@
+/**
+ * Untrinsics - Header-only portable implementations of common Intel intrinsics
+ * for cryptographic implementations.
+ * https://github.com/jedisct1/untrinsics
+ * (C) 2025 Frank Denis - Public Domain.
+ */
+
+#ifndef untrinsics_H
+#define untrinsics_H
+
+#define __untrinsics__ 1
+
+#include <stdint.h>
+#include <string.h>
+
+#ifndef __has_attribute
+# define __has_attribute(x) 0
+#endif
+#if !(__has_attribute(aligned) || defined(__GNUC__) || defined(__clang__) || defined(__attribute__))
+# define __attribute__(x)
+#endif
+
+typedef union {
+    uint8_t  b[16];
+    uint32_t w[4];
+    uint64_t q[2];
+} __m128i __attribute__((aligned(16)));
+
+/* clang-format off */
+
+static const uint8_t UNTRINSICS_SBOX[256] __attribute__((aligned(64))) = {
+    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
+    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
+    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
+    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
+    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
+    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
+    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
+    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
+    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
+    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
+    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+};
+
+static const uint8_t UNTRINSICS_INV_SBOX[256] __attribute__((aligned(64))) = {
+    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
+    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
+    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
+    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
+    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
+    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
+    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
+    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
+    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
+    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
+    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+};
+
+/* clang-format on */
+
+static volatile uint8_t untrinsics_optblocker_u8;
+static volatile uint32_t untrinsics_optblocker_u32;
+static volatile uint64_t untrinsics_optblocker_u64;
+
+#ifdef UNTRINSICS_MITIGATE
+static inline uint8_t
+untrinsics_sbox(const uint8_t x)
+{
+    uint32_t optblocker_u32 = untrinsics_optblocker_u32;
+    uint8_t result = 0;
+
+    for (int i = 0; i < 256; i++) {
+        uint32_t diff = (uint32_t) (i ^ x);
+        uint32_t mask = (((diff - 1) >> 29) ^ optblocker_u32) >> 2;
+        result |= UNTRINSICS_SBOX[i] & -(uint8_t) mask;
+    }
+    return result;
+}
+
+static inline uint8_t
+untrinsics_inv_sbox(const uint8_t x)
+{
+    uint32_t optblocker_u32 = untrinsics_optblocker_u32;
+    uint8_t result = 0;
+
+    for (int i = 0; i < 256; i++) {
+        uint32_t diff = (uint32_t) (i ^ x);
+        uint32_t mask = (((diff - 1) >> 29) ^ optblocker_u32) >> 2;
+        result |= UNTRINSICS_INV_SBOX[i] & -(uint8_t) mask;
+    }
+    return result;
+}
+#else
+# define untrinsics_sbox(x) UNTRINSICS_SBOX[x]
+# define untrinsics_inv_sbox(x) UNTRINSICS_INV_SBOX[x]
+#endif
+
+/* Multiply by x in GF(2^8) using the AES polynomial (usually compiled to branchless code) */
+static inline uint8_t
+untrinsics_xtime(uint8_t x)
+{
+    return (uint8_t) ((x << 1) ^ ((x & 0x80) ? 0x1B : 0));
+}
+
+/* Multiply by 2 (MixColumns) */
+static inline uint8_t
+untrinsics_mul2(uint8_t x)
+{
+    return untrinsics_xtime(x);
+}
+
+/* Multiply by 3 (MixColumns) */
+static inline uint8_t
+untrinsics_mul3(uint8_t x)
+{
+    return (uint8_t) (untrinsics_xtime(x) ^ x);
+}
+
+/* Multiply by 9 (InvMixColumns) */
+static inline uint8_t
+untrinsics_mul9(uint8_t x)
+{
+    uint8_t t2 = untrinsics_xtime(x);
+    uint8_t t4 = untrinsics_xtime(t2);
+    uint8_t t8 = untrinsics_xtime(t4);
+    return (uint8_t) (t8 ^ x);
+}
+
+/* Multiply by 0x0B (InvMixColumns) */
+static inline uint8_t
+untrinsics_mul0b(uint8_t x)
+{
+    uint8_t t2 = untrinsics_xtime(x);
+    uint8_t t4 = untrinsics_xtime(t2);
+    uint8_t t8 = untrinsics_xtime(t4);
+    return (uint8_t) (t8 ^ t2 ^ x);
+}
+
+/* Multiply by 0x0D (InvMixColumns) */
+static inline uint8_t
+untrinsics_mul0d(uint8_t x)
+{
+    uint8_t t2 = untrinsics_xtime(x);
+    uint8_t t4 = untrinsics_xtime(t2);
+    uint8_t t8 = untrinsics_xtime(t4);
+    return (uint8_t) (t8 ^ t4 ^ x);
+}
+
+/* Multiply by 0x0E (InvMixColumns) */
+static inline uint8_t
+untrinsics_mul0e(uint8_t x)
+{
+    uint8_t t2 = untrinsics_xtime(x);
+    uint8_t t4 = untrinsics_xtime(t2);
+    uint8_t t8 = untrinsics_xtime(t4);
+    return (uint8_t) (t8 ^ t4 ^ t2);
+}
+
+/* Combine SubBytes and ShiftRows (forward) */
+static inline void
+untrinsics_sub_shiftrows(uint8_t s[16])
+{
+    uint8_t tmp[16];
+    tmp[0] = untrinsics_sbox(s[0]);
+    tmp[1] = untrinsics_sbox(s[5]);
+    tmp[2] = untrinsics_sbox(s[10]);
+    tmp[3] = untrinsics_sbox(s[15]);
+    tmp[4] = untrinsics_sbox(s[4]);
+    tmp[5] = untrinsics_sbox(s[9]);
+    tmp[6] = untrinsics_sbox(s[14]);
+    tmp[7] = untrinsics_sbox(s[3]);
+    tmp[8] = untrinsics_sbox(s[8]);
+    tmp[9] = untrinsics_sbox(s[13]);
+    tmp[10] = untrinsics_sbox(s[2]);
+    tmp[11] = untrinsics_sbox(s[7]);
+    tmp[12] = untrinsics_sbox(s[12]);
+    tmp[13] = untrinsics_sbox(s[1]);
+    tmp[14] = untrinsics_sbox(s[6]);
+    tmp[15] = untrinsics_sbox(s[11]);
+    memcpy(s, tmp, 16);
+}
+
+/* Combine InvSubBytes and InvShiftRows */
+static inline void
+untrinsics_invsub_shiftrows(uint8_t s[16])
+{
+    uint8_t tmp[16];
+    tmp[0] = untrinsics_inv_sbox(s[0]);
+    tmp[1] = untrinsics_inv_sbox(s[13]);
+    tmp[2] = untrinsics_inv_sbox(s[10]);
+    tmp[3] = untrinsics_inv_sbox(s[7]);
+    tmp[4] = untrinsics_inv_sbox(s[4]);
+    tmp[5] = untrinsics_inv_sbox(s[1]);
+    tmp[6] = untrinsics_inv_sbox(s[14]);
+    tmp[7] = untrinsics_inv_sbox(s[11]);
+    tmp[8] = untrinsics_inv_sbox(s[8]);
+    tmp[9] = untrinsics_inv_sbox(s[5]);
+    tmp[10] = untrinsics_inv_sbox(s[2]);
+    tmp[11] = untrinsics_inv_sbox(s[15]);
+    tmp[12] = untrinsics_inv_sbox(s[12]);
+    tmp[13] = untrinsics_inv_sbox(s[9]);
+    tmp[14] = untrinsics_inv_sbox(s[6]);
+    tmp[15] = untrinsics_inv_sbox(s[3]);
+    memcpy(s, tmp, 16);
+}
+
+/* MixColumns transformation (forward) */
+static inline void
+untrinsics_mixcolumns(uint8_t s[16])
+{
+    for (int c = 0; c < 4; c++) {
+        int i = 4 * c;
+        uint8_t a0 = s[i], a1 = s[i + 1], a2 = s[i + 2], a3 = s[i + 3];
+        s[i] = (uint8_t) (untrinsics_mul2(a0) ^ untrinsics_mul3(a1) ^ a2 ^ a3);
+        s[i + 1] = (uint8_t) (a0 ^ untrinsics_mul2(a1) ^ untrinsics_mul3(a2) ^ a3);
+        s[i + 2] = (uint8_t) (a0 ^ a1 ^ untrinsics_mul2(a2) ^ untrinsics_mul3(a3));
+        s[i + 3] = (uint8_t) (untrinsics_mul3(a0) ^ a1 ^ a2 ^ untrinsics_mul2(a3));
+    }
+}
+
+/* InvMixColumns transformation */
+static inline void
+untrinsics_inv_mixcolumns(uint8_t s[16])
+{
+    for (int c = 0; c < 4; c++) {
+        int i = 4 * c;
+        uint8_t a0 = s[i], a1 = s[i + 1], a2 = s[i + 2], a3 = s[i + 3];
+        s[i] = (uint8_t) (untrinsics_mul0e(a0) ^ untrinsics_mul0b(a1) ^ untrinsics_mul0d(a2) ^
+                          untrinsics_mul9(a3));
+        s[i + 1] = (uint8_t) (untrinsics_mul9(a0) ^ untrinsics_mul0e(a1) ^ untrinsics_mul0b(a2) ^
+                          untrinsics_mul0d(a3));
+        s[i + 2] = (uint8_t) (untrinsics_mul0d(a0) ^ untrinsics_mul9(a1) ^ untrinsics_mul0e(a2) ^
+                          untrinsics_mul0b(a3));
+        s[i + 3] = (uint8_t) (untrinsics_mul0b(a0) ^ untrinsics_mul0d(a1) ^ untrinsics_mul9(a2) ^
+                          untrinsics_mul0e(a3));
+    }
+}
+
+/* Rotate a 32-bit word right by 8 bits */
+static inline uint32_t
+untrinsics_rot_word(const uint32_t x)
+{
+    return (x >> 8) | (x << 24);
+}
+
+/* Apply S-box to each byte in a 32-bit word */
+static inline uint32_t
+untrinsics_sub_word(const uint32_t x)
+{
+    return ((uint32_t) untrinsics_sbox((x >> 24) & 0xff) << 24) |
+           ((uint32_t) untrinsics_sbox((x >> 16) & 0xff) << 16) |
+           ((uint32_t) untrinsics_sbox((x >> 8) & 0xff) << 8) |
+           ((uint32_t) untrinsics_sbox(x & 0xff));
+}
+
+/* Copy __m128i value */
+static inline __m128i
+untrinsics_copy(const __m128i a)
+{
+    __m128i r;
+    memcpy(r.b, a.b, 16);
+    return r;
+}
+
+/* AES encryption round */
+static inline __m128i
+_mm_aesenc_si128(const __m128i a_, const __m128i rk)
+{
+    __m128i a = untrinsics_copy(a_);
+    untrinsics_sub_shiftrows(a.b);
+    untrinsics_mixcolumns(a.b);
+    for (int i = 0; i < 16; i++)
+        a.b[i] ^= rk.b[i];
+    return a;
+}
+
+/* Final AES encryption round */
+static inline __m128i
+_mm_aesenclast_si128(const __m128i a_, const __m128i rk)
+{
+    __m128i a = untrinsics_copy(a_);
+    untrinsics_sub_shiftrows(a.b);
+    for (int i = 0; i < 16; i++)
+        a.b[i] ^= rk.b[i];
+    return a;
+}
+
+/* AES decryption round */
+static inline __m128i
+_mm_aesdec_si128(const __m128i a_, const __m128i rk)
+{
+    __m128i a = untrinsics_copy(a_);
+    untrinsics_invsub_shiftrows(a.b);
+    untrinsics_inv_mixcolumns(a.b);
+    for (int i = 0; i < 16; i++)
+        a.b[i] ^= rk.b[i];
+    return a;
+}
+
+/* Final AES decryption round */
+static inline __m128i
+_mm_aesdeclast_si128(const __m128i a_, const __m128i rk)
+{
+    __m128i a = untrinsics_copy(a_);
+    untrinsics_invsub_shiftrows(a.b);
+    for (int i = 0; i < 16; i++)
+        a.b[i] ^= rk.b[i];
+    return a;
+}
+
+/* Transform encryption round key to decryption key */
+static inline __m128i
+_mm_aesimc_si128(const __m128i a_)
+{
+    __m128i a = untrinsics_copy(a_);
+    untrinsics_inv_mixcolumns(a.b);
+    return a;
+}
+
+/* Key expansion assist */
+static inline __m128i
+_mm_aeskeygenassist_si128(const __m128i a, const uint8_t rcon)
+{
+    __m128i dst;
+    const uint32_t x1 = a.w[1];
+    const uint32_t x3 = a.w[3];
+    const uint32_t sx1 = untrinsics_sub_word(x1);
+    const uint32_t sx3 = untrinsics_sub_word(x3);
+
+    dst.w[0] = sx1;
+    dst.w[1] = untrinsics_rot_word(sx1) ^ rcon;
+    dst.w[2] = sx3;
+    dst.w[3] = untrinsics_rot_word(sx3) ^ rcon;
+    return dst;
+}
+
+/* Carry-less multiplication of selected 64-bit lanes.
+   imm: bit 0x01 selects the high (rather than the low) 64-bit lane of a,
+        bit 0x10 selects the lane of b.
+*/
+static inline __m128i
+_mm_clmulepi64_si128(const __m128i a, const __m128i b, const int imm)
+{
+    __m128i r;
+    uint64_t x = (imm & 1) ? a.q[1] : a.q[0];
+    uint64_t y = (imm & 0x10) ? b.q[1] : b.q[0];
+    uint64_t r_lo = 0, r_hi = 0;
+    {
+        uint64_t bit = y & 1ULL;
+        uint64_t mask = 0ULL - bit;
+        r_lo ^= x & mask;
+    }
+    for (int i = 1; i < 64; i++) {
+        uint64_t bit = (y >> i) & 1ULL;
+        uint64_t mask = 0ULL - bit;
+        r_lo ^= (x << i) & mask;
+        r_hi ^= (x >> (64 - i)) & mask;
+    }
+    r.q[0] = r_lo;
+    r.q[1] = r_hi;
+    return r;
+}
+
+/* Load 128 bits from unaligned memory */
+static inline __m128i
+_mm_loadu_si128(const void* const p)
+{
+    __m128i r;
+    memcpy(r.b, p, 16);
+    return r;
+}
+
+/* Store 128 bits to unaligned memory */
+static inline void
+_mm_storeu_si128(void* const p, const __m128i a)
+{
+    memcpy(p, a.b, 16);
+}
+
+/* Bitwise XOR of 128-bit values */
+static inline __m128i
+_mm_xor_si128(const __m128i a, const __m128i b)
+{
+    __m128i r;
+    for (int i = 0; i < 16; i++)
+        r.b[i] = (uint8_t) (a.b[i] ^ b.b[i]);
+    return r;
+}
+
+/* Bitwise OR of 128-bit values */
+static inline __m128i
+_mm_or_si128(const __m128i a, const __m128i b)
+{
+    __m128i r;
+    for (int i = 0; i < 16; i++)
+        r.b[i] = (uint8_t) (a.b[i] | b.b[i]);
+    return r;
+}
+
+/* Bitwise AND of 128-bit values */
+static inline __m128i
+_mm_and_si128(const __m128i a, const __m128i b)
+{
+    __m128i r;
+    for (int i = 0; i < 16; i++)
+        r.b[i] = (uint8_t) (a.b[i] & b.b[i]);
+    return r;
+}
+
+/* Set __m128i from two 64-bit integers (high, low) */
+static inline __m128i
+_mm_set_epi64x(const long long high, const long long low)
+{
+    __m128i r;
+    r.q[0] = (uint64_t) low;
+    r.q[1] = (uint64_t) high;
+    return r;
+}
+
+/* Shift left by imm bytes (zero-fill) */
+static inline __m128i
+_mm_slli_si128(const __m128i a, const int imm)
+{
+    __m128i r;
+    if (imm <= 0)
+        return a;
+    if (imm >= 16) {
+        memset(r.b, 0, 16);
+        return r;
+    }
+    memset(r.b, 0, imm);
+    memcpy(r.b + imm, a.b, 16 - imm);
+    return r;
+}
+
+/* Shift right by imm bytes (zero-fill) */
+static inline __m128i
+_mm_srli_si128(const __m128i a, const int imm)
+{
+    __m128i r;
+    if (imm <= 0)
+        return a;
+    if (imm >= 16) {
+        memset(r.b, 0, 16);
+        return r;
+    }
+    memcpy(r.b, a.b + imm, 16 - imm);
+    memset(r.b + (16 - imm), 0, imm);
+    return r;
+}
+
+#ifndef _MM_SHUFFLE
+# define _MM_SHUFFLE(z, y, x, w) (((z & 3) << 6) | ((y & 3) << 4) | ((x & 3) << 2) | (w & 3))
+#endif
+
+/* Shuffle 32-bit words */
+static inline __m128i
+_mm_shuffle_epi32(const __m128i a, const int imm)
+{
+    __m128i r;
+    int w0 = imm & 0x3;
+    int w1 = (imm >> 2) & 0x3;
+    int w2 = (imm >> 4) & 0x3;
+    int w3 = (imm >> 6) & 0x3;
+    r.w[0] = a.w[w0];
+    r.w[1] = a.w[w1];
+    r.w[2] = a.w[w2];
+    r.w[3] = a.w[w3];
+    return r;
+}
+
+/* Shuffle bytes using a mask; if mask bit 7 is set, output zero */
+static inline __m128i
+_mm_shuffle_epi8(const __m128i a, const __m128i b)
+{
+    __m128i r;
+    for (int i = 0; i < 16; i++) {
+        uint8_t index = b.b[i] & 0x0F;
+        uint8_t mask = b.b[i] & 0x80;
+        r.b[i] = mask ? 0 : a.b[index];
+    }
+    return r;
+}
+
+/* Load 64 bits from unaligned memory; zero upper half */
+static inline __m128i
+_mm_loadu_si64(const void* const mem_addr)
+{
+    __m128i r;
+    uint64_t tmp;
+    memcpy(&tmp, mem_addr, 8);
+    r.q[0] = tmp;
+    r.q[1] = 0;
+    return r;
+}
+
+/* Set __m128i from 16 int8_t values */
+static inline __m128i
+_mm_setr_epi8(const int8_t b0, const int8_t b1, const int8_t b2, const int8_t b3, const int8_t b4,
+              const int8_t b5, const int8_t b6, const int8_t b7, const int8_t b8, const int8_t b9,
+              const int8_t b10, const int8_t b11, const int8_t b12, const int8_t b13,
+              const int8_t b14, const int8_t b15)
+{
+    __m128i r;
+    r.b[0] = b0;
+    r.b[1] = b1;
+    r.b[2] = b2;
+    r.b[3] = b3;
+    r.b[4] = b4;
+    r.b[5] = b5;
+    r.b[6] = b6;
+    r.b[7] = b7;
+    r.b[8] = b8;
+    r.b[9] = b9;
+    r.b[10] = b10;
+    r.b[11] = b11;
+    r.b[12] = b12;
+    r.b[13] = b13;
+    r.b[14] = b14;
+    r.b[15] = b15;
+    return r;
+}
+
+/* Set __m128i from 4 int values */
+static inline __m128i
+_mm_setr_epi32(const int e0, const int e1, const int e2, const int e3)
+{
+    __m128i v;
+    v.w[0] = (uint32_t) e0;
+    v.w[1] = (uint32_t) e1;
+    v.w[2] = (uint32_t) e2;
+    v.w[3] = (uint32_t) e3;
+    return v;
+}
+
+/* Logical right shift each 32-bit lane by imm8 */
+static inline __m128i
+_mm_srli_epi32(const __m128i v, const int imm8)
+{
+    __m128i r;
+    r.w[0] = v.w[0] >> imm8;
+    r.w[1] = v.w[1] >> imm8;
+    r.w[2] = v.w[2] >> imm8;
+    r.w[3] = v.w[3] >> imm8;
+    return r;
+}
+
+/* Logical left shift each 32-bit lane by imm8 */
+static inline __m128i
+_mm_slli_epi32(const __m128i v, const int imm8)
+{
+    __m128i r;
+    r.w[0] = v.w[0] << imm8;
+    r.w[1] = v.w[1] << imm8;
+    r.w[2] = v.w[2] << imm8;
+    r.w[3] = v.w[3] << imm8;
+    return r;
+}
+
+/* Logical right shift each 16-bit lane by imm8 */
+static inline __m128i
+_mm_srli_epi16(const __m128i v, const int imm8)
+{
+    __m128i r;
+    for (int i = 0; i < 8; i++) {
+        uint16_t val = (uint16_t) v.b[i * 2] | ((uint16_t) v.b[i * 2 + 1] << 8);
+        val >>= imm8;
+        r.b[i * 2] = (uint8_t) (val & 0xff);
+        r.b[i * 2 + 1] = (uint8_t) (val >> 8);
+    }
+    return r;
+}
+
+/* Logical left shift each 16-bit lane by imm8 */
+static inline __m128i
+_mm_slli_epi16(const __m128i v, const int imm8)
+{
+    __m128i r;
+    for (int i = 0; i < 8; i++) {
+        uint16_t val = (uint16_t) v.b[i * 2] | ((uint16_t) v.b[i * 2 + 1] << 8);
+        val <<= imm8;
+        r.b[i * 2] = (uint8_t) (val & 0xff);
+        r.b[i * 2 + 1] = (uint8_t) (val >> 8);
+    }
+    return r;
+}
+
+/* Logical right shift each 64-bit lane by imm8 */
+static inline __m128i
+_mm_srli_epi64(const __m128i v, const int imm8)
+{
+    __m128i r;
+    r.q[0] = v.q[0] >> imm8;
+    r.q[1] = v.q[1] >> imm8;
+    return r;
+}
+
+/* Logical left shift each 64-bit lane by imm8 */
+static inline __m128i
+_mm_slli_epi64(const __m128i v, const int imm8)
+{
+    __m128i r;
+    r.q[0] = v.q[0] << imm8;
+    r.q[1] = v.q[1] << imm8;
+    return r;
+}
+
+/* Set __m128i to zero */
+static inline __m128i
+_mm_setzero_si128(void)
+{
+    __m128i r;
+    memset(r.b, 0, 16);
+    return r;
+}
+
+/* Set all 16 bytes to the same 8-bit value */
+static inline __m128i
+_mm_set1_epi8(const int8_t a)
+{
+    __m128i r;
+    for (int i = 0; i < 16; i++)
+        r.b[i] = (uint8_t) a;
+    return r;
+}
+
+/* Add 8-bit integers in two __m128i values */
+static inline __m128i
+_mm_add_epi8(const __m128i a, const __m128i b)
+{
+    __m128i r;
+    for (int i = 0; i < 16; i++)
+        r.b[i] = (uint8_t) (a.b[i] + b.b[i]);
+    return r;
+}
+
+/* Subtract 8-bit integers in two __m128i values */
+static inline __m128i
+_mm_sub_epi8(const __m128i a, const __m128i b)
+{
+    __m128i r;
+    for (int i = 0; i < 16; i++)
+        r.b[i] = (uint8_t) (a.b[i] - b.b[i]);
+    return r;
+}
+
+/* Add 64-bit integers in two __m128i values */
+static inline __m128i
+_mm_add_epi64(const __m128i a, const __m128i b)
+{
+    __m128i r;
+    r.q[0] = a.q[0] + b.q[0];
+    r.q[1] = a.q[1] + b.q[1];
+    return r;
+}
+
+/* Subtract 64-bit integers in two __m128i values */
+static inline __m128i
+_mm_sub_epi64(const __m128i a, const __m128i b)
+{
+    __m128i r;
+    r.q[0] = a.q[0] - b.q[0];
+    r.q[1] = a.q[1] - b.q[1];
+    return r;
+}
+
+/* Compare 16 bytes for equality; result byte is 0xFF if equal, else 0x00 */
+static inline __m128i
+_mm_cmpeq_epi8(const __m128i a, const __m128i b)
+{
+    __m128i r;
+    uint64_t optblocker_u8 = untrinsics_optblocker_u8;
+    for (int i = 0; i < 16; i++) {
+        uint8_t diff = a.b[i] ^ b.b[i];
+        uint8_t t = ((diff | (uint8_t) (-diff)) >> 5 ^ optblocker_u8) >> 2;
+        r.b[i] = -(t ^ 1);
+    }
+    return r;
+}
+
+/* _mm_test_all_zeros: Returns 1 if (V & M) is all zeros, 0 otherwise. */
+#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))
+
+/* _mm_testz_si128: Returns 1 if (a & b) is all zeros, 0 otherwise. */
+static inline int
+_mm_testz_si128(const __m128i a, const __m128i b)
+{
+    uint64_t optblocker_u64 = untrinsics_optblocker_u64;
+    uint64_t x = (a.q[0] & b.q[0]) | (a.q[1] & b.q[1]);
+    return (int) (((((x | (optblocker_u64 ^ -x)) >> 61) ^ optblocker_u64) >> 2) ^ 1);
+}
+
+/* _mm_test_all_ones: Returns 1 if all bits of a are 1, 0 otherwise. */
+static inline int
+_mm_test_all_ones(const __m128i a)
+{
+    uint64_t optblocker_u64 = untrinsics_optblocker_u64;
+    uint64_t t = (a.q[0] ^ ~0ULL) | (a.q[1] ^ ~0ULL);
+    return (int) (((((t | (optblocker_u64 ^ -t)) >> 61) ^ optblocker_u64) >> 2) ^ 1);
+}
+
+#endif /* untrinsics_H */
diff --git a/pdns/dnsdistdist/ext/ipcrypt2/softaes/untrinsics.h b/pdns/dnsdistdist/ext/ipcrypt2/softaes/untrinsics.h
new file mode 120000
index 0000000000..f37e960152
--- /dev/null
+++ b/pdns/dnsdistdist/ext/ipcrypt2/softaes/untrinsics.h
@@ -0,0 +1 @@
+../../../../../ext/ipcrypt2/softaes/untrinsics.h
\ No newline at end of file
-- 
2.47.3
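
A quick way to smoke-test the portable implementations above (not part of the patch): expand an AES-128 key with _mm_aeskeygenassist_si128 and encrypt one block, comparing against the FIPS-197 Appendix C.1 vector. The sketch below is illustrative only; the file name, the build line, and the EXPAND_128 helper are hypothetical (the helper is the standard AES-NI key-expansion idiom, not something shipped by ipcrypt2), and like the header's word/lane overlay it assumes a little-endian host.

/* untrinsics_check.c (hypothetical):
 *   cc -Iext/ipcrypt2/softaes untrinsics_check.c && ./a.out
 */
#include <stdio.h>
#include <string.h>

#include "untrinsics.h"

/* One AES-128 key-expansion step: derive round key i from round key i - 1
   (standard AES-NI idiom; rcon is the round constant for step i). */
#define EXPAND_128(ks, i, rcon)                                           \
    do {                                                                  \
        __m128i t = _mm_aeskeygenassist_si128((ks)[(i) - 1], (rcon));     \
        __m128i k = (ks)[(i) - 1];                                        \
        t = _mm_shuffle_epi32(t, _MM_SHUFFLE(3, 3, 3, 3));                \
        k = _mm_xor_si128(k, _mm_slli_si128(k, 4));                       \
        k = _mm_xor_si128(k, _mm_slli_si128(k, 4));                       \
        k = _mm_xor_si128(k, _mm_slli_si128(k, 4));                       \
        (ks)[i] = _mm_xor_si128(k, t);                                    \
    } while (0)

int main(void)
{
    /* FIPS-197 Appendix C.1 test vector. */
    const uint8_t key[16] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                              0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f };
    const uint8_t pt[16] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
                             0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff };
    const uint8_t expected[16] = { 0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30,
                                   0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a };
    __m128i ks[11];
    uint8_t ct[16];

    ks[0] = _mm_loadu_si128(key);
    EXPAND_128(ks, 1, 0x01);
    EXPAND_128(ks, 2, 0x02);
    EXPAND_128(ks, 3, 0x04);
    EXPAND_128(ks, 4, 0x08);
    EXPAND_128(ks, 5, 0x10);
    EXPAND_128(ks, 6, 0x20);
    EXPAND_128(ks, 7, 0x40);
    EXPAND_128(ks, 8, 0x80);
    EXPAND_128(ks, 9, 0x1b);
    EXPAND_128(ks, 10, 0x36);

    /* Initial whitening, nine full rounds, one final round. */
    __m128i block = _mm_xor_si128(_mm_loadu_si128(pt), ks[0]);
    for (int i = 1; i < 10; i++) {
        block = _mm_aesenc_si128(block, ks[i]);
    }
    block = _mm_aesenclast_si128(block, ks[10]);

    _mm_storeu_si128(ct, block);
    puts(memcmp(ct, expected, 16) == 0 ? "OK" : "FAIL");
    return 0;
}

Because untrinsics.h defines its own __m128i, the check builds on any C99 platform without x86 headers; it should not be compiled in the same translation unit as <emmintrin.h> or <wmmintrin.h>, which declare the same names.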