From: Rafael Kitover Date: Tue, 23 May 2023 21:04:36 +0000 (+0000) Subject: bump: Update blake3 1.3.1 -> 71a2646 X-Git-Tag: v4.8.2~9^2~4 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e72bcaffc7f3a961b940e48b400065b520afcff0;p=thirdparty%2Fccache.git bump: Update blake3 1.3.1 -> 71a2646 Update blake3 to master, commit 71a2646 . Signed-off-by: Rafael Kitover --- diff --git a/LICENSE.adoc b/LICENSE.adoc index f4d26d6a7..af8eee67d 100644 --- a/LICENSE.adoc +++ b/LICENSE.adoc @@ -72,8 +72,9 @@ along with this program. If not, see . === src/third_party/blake3/blake3_* -This is a subset of https://github.com/BLAKE3-team/BLAKE3[BLAKE3] 1.3.1 with -the following license: +This is a subset of https://github.com/BLAKE3-team/BLAKE3[BLAKE3] (git SHA: +https://github.com/BLAKE3-team/BLAKE3/commit/71a2646180c787e22f8681c5fec7655a0ad51e99[71a2646]) +with the following license: ---- This work is released into the public domain with CC0 1.0. Alternatively, it is diff --git a/src/third_party/blake3/blake3.c b/src/third_party/blake3/blake3.c index 1239433c6..dc343f91c 100644 --- a/src/third_party/blake3/blake3.c +++ b/src/third_party/blake3/blake3.c @@ -246,7 +246,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, // The wide helper function returns (writes out) an array of chaining values // and returns the length of that array. The number of chaining values returned -// is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, +// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, // if the input is shorter than that many chunks. The reason for maintaining a // wide array of chaining values going back up the tree, is to allow the // implementation to hash as many parents in parallel as possible. 
@@ -254,7 +254,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, // As a special case when the SIMD degree is 1, this function will still return // at least 2 outputs. This guarantees that this function doesn't perform the // root compression. (If it did, it would use the wrong flags, and also we -// wouldn't be able to implement exendable ouput.) Note that this function is +// wouldn't be able to implement exendable output.) Note that this function is // not used when the whole input is only 1 chunk long; that's a different // codepath. // diff --git a/src/third_party/blake3/blake3.h b/src/third_party/blake3/blake3.h index 7caf9b4b5..b280db388 100644 --- a/src/third_party/blake3/blake3.h +++ b/src/third_party/blake3/blake3.h @@ -8,7 +8,7 @@ extern "C" { #endif -#define BLAKE3_VERSION_STRING "1.3.1" +#define BLAKE3_VERSION_STRING "1.3.3" #define BLAKE3_KEY_LEN 32 #define BLAKE3_OUT_LEN 32 #define BLAKE3_BLOCK_LEN 64 diff --git a/src/third_party/blake3/blake3_avx2_x86-64_windows_gnu.S b/src/third_party/blake3/blake3_avx2_x86-64_windows_gnu.S index bb58d2ae6..3d4be4a7d 100644 --- a/src/third_party/blake3/blake3_avx2_x86-64_windows_gnu.S +++ b/src/third_party/blake3/blake3_avx2_x86-64_windows_gnu.S @@ -1784,7 +1784,7 @@ blake3_hash_many_avx2: vmovdqu xmmword ptr [rbx+0x10], xmm1 jmp 4b -.section .rodata +.section .rdata .p2align 6 ADD0: .long 0, 1, 2, 3, 4, 5, 6, 7 diff --git a/src/third_party/blake3/blake3_avx512.c b/src/third_party/blake3/blake3_avx512.c index 9c35b08c4..334d82dc7 100644 --- a/src/third_party/blake3/blake3_avx512.c +++ b/src/third_party/blake3/blake3_avx512.c @@ -1047,13 +1047,26 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, INLINE void load_counters16(uint64_t counter, bool increment_counter, __m512i *out_lo, __m512i *out_hi) { const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter); - const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - const 
__m512i add1 = _mm512_and_si512(mask, add0); - __m512i l = _mm512_add_epi32(_mm512_set1_epi32((int32_t)counter), add1); - __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT); - __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32((int32_t)(counter >> 32)), carry, _mm512_set1_epi32((int32_t)(counter >> 32)), _mm512_set1_epi32(1)); - *out_lo = l; - *out_hi = h; + const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + const __m512i masked_deltas = _mm512_and_si512(deltas, mask); + const __m512i low_words = _mm512_add_epi32( + _mm512_set1_epi32((int32_t)counter), + masked_deltas); + // The carry bit is 1 if the high bit of the word was 1 before addition and is + // 0 after. + // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to + // compute the carry bits here, and originally we did, but that intrinsic is + // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271. + const __m512i carries = _mm512_srli_epi32( + _mm512_andnot_si512( + low_words, // 0 after (gets inverted by andnot) + _mm512_set1_epi32((int32_t)counter)), // and 1 before + 31); + const __m512i high_words = _mm512_add_epi32( + _mm512_set1_epi32((int32_t)(counter >> 32)), + carries); + *out_lo = low_words; + *out_hi = high_words; } static diff --git a/src/third_party/blake3/blake3_avx512_x86-64_windows_gnu.S b/src/third_party/blake3/blake3_avx512_x86-64_windows_gnu.S index e10b9f36c..ba4fc5fa9 100644 --- a/src/third_party/blake3/blake3_avx512_x86-64_windows_gnu.S +++ b/src/third_party/blake3/blake3_avx512_x86-64_windows_gnu.S @@ -2587,7 +2587,7 @@ blake3_compress_xof_avx512: add rsp, 72 ret -.section .rodata +.section .rdata .p2align 6 INDEX0: .long 0, 1, 2, 3, 16, 17, 18, 19 diff --git a/src/third_party/blake3/blake3_dispatch.c b/src/third_party/blake3/blake3_dispatch.c index b49805897..2ab0093ee 100644 --- a/src/third_party/blake3/blake3_dispatch.c +++ b/src/third_party/blake3/blake3_dispatch.c @@ -10,14 +10,14 @@ 
#elif defined(__GNUC__) #include <immintrin.h> #else -#error "Unimplemented!" +#undef IS_X86 /* Unimplemented! */ #endif #endif #define MAYBE_UNUSED(x) (void)((x)) #if defined(IS_X86) -static uint64_t xgetbv() { +static uint64_t xgetbv(void) { #if defined(_MSC_VER) return _xgetbv(0); #else @@ -82,7 +82,7 @@ static /* Allow the variable to be controlled manually for testing */ static #endif enum cpu_feature - get_cpu_features() { + get_cpu_features(void) { if (g_cpu_features != UNDEFINED) { return g_cpu_features; @@ -101,7 +101,7 @@ static if (*edx & (1UL << 26)) features |= SSE2; #endif - if (*ecx & (1UL << 0)) + if (*ecx & (1UL << 9)) features |= SSSE3; if (*ecx & (1UL << 19)) features |= SSE41; diff --git a/src/third_party/blake3/blake3_impl.h b/src/third_party/blake3/blake3_impl.h index cc5672f22..3ba9ceb04 100644 --- a/src/third_party/blake3/blake3_impl.h +++ b/src/third_party/blake3/blake3_impl.h @@ -46,7 +46,6 @@ enum blake3_flags { #if defined(_MSC_VER) #include <intrin.h> #endif -#include <immintrin.h> #endif #if !defined(BLAKE3_USE_NEON) @@ -88,7 +87,7 @@ static const uint8_t MSG_SCHEDULE[7][16] = { /* x is assumed to be nonzero. */ static unsigned int highest_one(uint64_t x) { #if defined(__GNUC__) || defined(__clang__) - return 63 ^ __builtin_clzll(x); + return 63 ^ (unsigned int)__builtin_clzll(x); #elif defined(_MSC_VER) && defined(IS_X86_64) unsigned long index; _BitScanReverse64(&index, x); @@ -118,7 +117,7 @@ static unsigned int highest_one(uint64_t x) { // Count the number of 1 bits. 
INLINE unsigned int popcnt(uint64_t x) { #if defined(__GNUC__) || defined(__clang__) - return __builtin_popcountll(x); + return (unsigned int)__builtin_popcountll(x); #else unsigned int count = 0; while (x != 0) { diff --git a/src/third_party/blake3/blake3_sse2_x86-64_windows_gnu.S b/src/third_party/blake3/blake3_sse2_x86-64_windows_gnu.S index 8852ba597..4facb50e7 100644 --- a/src/third_party/blake3/blake3_sse2_x86-64_windows_gnu.S +++ b/src/third_party/blake3/blake3_sse2_x86-64_windows_gnu.S @@ -2301,7 +2301,7 @@ blake3_compress_xof_sse2: ret -.section .rodata +.section .rdata .p2align 6 BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85 diff --git a/src/third_party/blake3/blake3_sse41_x86-64_windows_gnu.S b/src/third_party/blake3/blake3_sse41_x86-64_windows_gnu.S index 60d0a4042..02083f9d5 100644 --- a/src/third_party/blake3/blake3_sse41_x86-64_windows_gnu.S +++ b/src/third_party/blake3/blake3_sse41_x86-64_windows_gnu.S @@ -2042,7 +2042,7 @@ blake3_compress_xof_sse41: ret -.section .rodata +.section .rdata .p2align 6 BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85