=== src/third_party/blake3/blake3_*
-This is a subset of https://github.com/BLAKE3-team/BLAKE3[BLAKE3] 1.3.1 with
-the following license:
+This is a subset of https://github.com/BLAKE3-team/BLAKE3[BLAKE3] (git SHA:
+https://github.com/BLAKE3-team/BLAKE3/commit/71a2646180c787e22f8681c5fec7655a0ad51e99[71a2646])
+with the following license:
----
This work is released into the public domain with CC0 1.0. Alternatively, it is
// The wide helper function returns (writes out) an array of chaining values
// and returns the length of that array. The number of chaining values returned
-// is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
+// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
// if the input is shorter than that many chunks. The reason for maintaining a
// wide array of chaining values going back up the tree, is to allow the
// implementation to hash as many parents in parallel as possible.
// As a special case when the SIMD degree is 1, this function will still return
// at least 2 outputs. This guarantees that this function doesn't perform the
// root compression. (If it did, it would use the wrong flags, and also we
-// wouldn't be able to implement exendable ouput.) Note that this function is
+// wouldn't be able to implement extendable output.) Note that this function is
// not used when the whole input is only 1 chunk long; that's a different
// codepath.
//
extern "C" {
#endif
-#define BLAKE3_VERSION_STRING "1.3.1"
+#define BLAKE3_VERSION_STRING "1.3.3"
#define BLAKE3_KEY_LEN 32
#define BLAKE3_OUT_LEN 32
#define BLAKE3_BLOCK_LEN 64
vmovdqu xmmword ptr [rbx+0x10], xmm1
jmp 4b
-.section .rodata
+.section .rdata
.p2align 6
ADD0:
.long 0, 1, 2, 3, 4, 5, 6, 7
INLINE void load_counters16(uint64_t counter, bool increment_counter,
__m512i *out_lo, __m512i *out_hi) {
const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
- const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
- const __m512i add1 = _mm512_and_si512(mask, add0);
- __m512i l = _mm512_add_epi32(_mm512_set1_epi32((int32_t)counter), add1);
- __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT);
- __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32((int32_t)(counter >> 32)), carry, _mm512_set1_epi32((int32_t)(counter >> 32)), _mm512_set1_epi32(1));
- *out_lo = l;
- *out_hi = h;
+ const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
+ const __m512i low_words = _mm512_add_epi32(
+ _mm512_set1_epi32((int32_t)counter),
+ masked_deltas);
+ // The carry bit is 1 if the high bit of the word was 1 before addition and is
+ // 0 after.
+ // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
+ // compute the carry bits here, and originally we did, but that intrinsic is
+ // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
+ const __m512i carries = _mm512_srli_epi32(
+ _mm512_andnot_si512(
+ low_words, // 0 after (gets inverted by andnot)
+ _mm512_set1_epi32((int32_t)counter)), // and 1 before
+ 31);
+ const __m512i high_words = _mm512_add_epi32(
+ _mm512_set1_epi32((int32_t)(counter >> 32)),
+ carries);
+ *out_lo = low_words;
+ *out_hi = high_words;
}
static
add rsp, 72
ret
-.section .rodata
+.section .rdata
.p2align 6
INDEX0:
.long 0, 1, 2, 3, 16, 17, 18, 19
#elif defined(__GNUC__)
#include <immintrin.h>
#else
-#error "Unimplemented!"
+#undef IS_X86 /* Unimplemented! */
#endif
#endif
#define MAYBE_UNUSED(x) (void)((x))
#if defined(IS_X86)
-static uint64_t xgetbv() {
+static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
return _xgetbv(0);
#else
static
#endif
enum cpu_feature
- get_cpu_features() {
+ get_cpu_features(void) {
if (g_cpu_features != UNDEFINED) {
return g_cpu_features;
if (*edx & (1UL << 26))
features |= SSE2;
#endif
- if (*ecx & (1UL << 0))
+ if (*ecx & (1UL << 9))
features |= SSSE3;
if (*ecx & (1UL << 19))
features |= SSE41;
#if defined(_MSC_VER)
#include <intrin.h>
#endif
-#include <immintrin.h>
#endif
#if !defined(BLAKE3_USE_NEON)
/* x is assumed to be nonzero. */
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
- return 63 ^ __builtin_clzll(x);
+ return 63 ^ (unsigned int)__builtin_clzll(x);
#elif defined(_MSC_VER) && defined(IS_X86_64)
unsigned long index;
_BitScanReverse64(&index, x);
// Count the number of 1 bits.
INLINE unsigned int popcnt(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
- return __builtin_popcountll(x);
+ return (unsigned int)__builtin_popcountll(x);
#else
unsigned int count = 0;
while (x != 0) {
ret
-.section .rodata
+.section .rdata
.p2align 6
BLAKE3_IV:
.long 0x6A09E667, 0xBB67AE85
ret
-.section .rodata
+.section .rdata
.p2align 6
BLAKE3_IV:
.long 0x6A09E667, 0xBB67AE85