From bd2147f30307ab62dbdb8252e9c6f2ef2b1ec44f Mon Sep 17 00:00:00 2001 From: Joel Rosdahl Date: Wed, 23 Apr 2025 19:22:03 +0200 Subject: [PATCH] bump: Upgrade to BLAKE3 1.8.2 --- LICENSE.adoc | 2 +- src/third_party/blake3/CMakeLists.txt | 2 +- src/third_party/blake3/blake3/blake3.c | 71 ++++++++++++++------- src/third_party/blake3/blake3/blake3.h | 6 +- src/third_party/blake3/blake3/blake3_impl.h | 29 +++++++++ 5 files changed, 85 insertions(+), 25 deletions(-) diff --git a/LICENSE.adoc b/LICENSE.adoc index 0d5cb6ca..225d1879 100644 --- a/LICENSE.adoc +++ b/LICENSE.adoc @@ -50,7 +50,7 @@ under less restrictive terms. === src/third_party/blake3/blake3/* -This is a subset of https://github.com/BLAKE3-team/BLAKE3[BLAKE3] 1.6.1 with the +This is a subset of https://github.com/BLAKE3-team/BLAKE3[BLAKE3] 1.8.2 with the following license: ---- diff --git a/src/third_party/blake3/CMakeLists.txt b/src/third_party/blake3/CMakeLists.txt index 6cff8006..343f9d4c 100644 --- a/src/third_party/blake3/CMakeLists.txt +++ b/src/third_party/blake3/CMakeLists.txt @@ -1,4 +1,4 @@ -register_dependency(Blake3 BUNDLED 1.6.1) +register_dependency(Blake3 BUNDLED 1.8.2) add_library( dep_blake3 STATIC diff --git a/src/third_party/blake3/blake3/blake3.c b/src/third_party/blake3/blake3/blake3.c index 7e6d01ec..74fc2e73 100644 --- a/src/third_party/blake3/blake3/blake3.c +++ b/src/third_party/blake3/blake3/blake3.c @@ -158,10 +158,10 @@ INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], // Given some input larger than one chunk, return the number of bytes that // should go in the left subtree. This is the largest power-of-2 number of // chunks that leaves at least 1 byte for the right subtree. -INLINE size_t left_len(size_t content_len) { - // Subtract 1 to reserve at least one byte for the right side. content_len +INLINE size_t left_subtree_len(size_t input_len) { + // Subtract 1 to reserve at least one byte for the right side. input_len // should always be greater than BLAKE3_CHUNK_LEN. - size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; + size_t full_chunks = (input_len - 1) / BLAKE3_CHUNK_LEN; return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; } @@ -265,11 +265,10 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, // Why not just have the caller split the input on the first update(), instead // of implementing this special rule? Because we don't want to limit SIMD or // multi-threading parallelism for that update(). -static size_t blake3_compress_subtree_wide(const uint8_t *input, - size_t input_len, - const uint32_t key[8], - uint64_t chunk_counter, - uint8_t flags, uint8_t *out) { +size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, + uint8_t *out, bool use_tbb) { // Note that the single chunk case does *not* bump the SIMD degree up to 2 // when it is 1. If this implementation adds multi-threading in the future, // this gives us the option of multi-threading even the 2-chunk case, which @@ -283,7 +282,7 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input, // the input into left and right subtrees. (Note that this is only optimal // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree // of 3 or something, we'll need a more complicated strategy.) - size_t left_input_len = left_len(input_len); + size_t left_input_len = left_subtree_len(input_len); size_t right_input_len = input_len - left_input_len; const uint8_t *right_input = &input[left_input_len]; uint64_t right_chunk_counter = @@ -303,12 +302,24 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input, } uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; - // Recurse! If this implementation adds multi-threading support in the - // future, this is where it will go. - size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, - chunk_counter, flags, cv_array); - size_t right_n = blake3_compress_subtree_wide( - right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); + // Recurse! + size_t left_n = -1; + size_t right_n = -1; + +#if defined(BLAKE3_USE_TBB) + blake3_compress_subtree_wide_join_tbb( + key, flags, use_tbb, + // left-hand side + input, left_input_len, chunk_counter, cv_array, &left_n, + // right-hand side + right_input, right_input_len, right_chunk_counter, right_cvs, &right_n); +#else + left_n = blake3_compress_subtree_wide( + input, left_input_len, key, chunk_counter, flags, cv_array, use_tbb); + right_n = blake3_compress_subtree_wide(right_input, right_input_len, key, + right_chunk_counter, flags, right_cvs, + use_tbb); +#endif // BLAKE3_USE_TBB // The special case again. If simd_degree=1, then we'll have left_n=1 and // right_n=1. Rather than compressing them into a single output, return @@ -334,16 +345,18 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input, // // As with compress_subtree_wide(), this function is not used on inputs of 1 // chunk or less. That's a different codepath. -INLINE void compress_subtree_to_parent_node( - const uint8_t *input, size_t input_len, const uint32_t key[8], - uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { +INLINE void +compress_subtree_to_parent_node(const uint8_t *input, size_t input_len, + const uint32_t key[8], uint64_t chunk_counter, + uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN], + bool use_tbb) { #if defined(BLAKE3_TESTING) assert(input_len > BLAKE3_CHUNK_LEN); #endif uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, - chunk_counter, flags, cv_array); + chunk_counter, flags, cv_array, use_tbb); assert(num_cvs <= MAX_SIMD_DEGREE_OR_2); // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because // as we just asserted, num_cvs will always be <=2 in that case. But GCC @@ -459,8 +472,8 @@ INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], self->cv_stack_len += 1; } -void blake3_hasher_update(blake3_hasher *self, const void *input, - size_t input_len) { +INLINE void blake3_hasher_update_base(blake3_hasher *self, const void *input, + size_t input_len, bool use_tbb) { // Explicitly checking for zero avoids causing UB by passing a null pointer // to memcpy. This comes up in practice with things like: // std::vector v; @@ -546,7 +559,7 @@ void blake3_hasher_update(blake3_hasher *self, const void *input, uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, self->chunk.chunk_counter, - self->chunk.flags, cv_pair); + self->chunk.flags, cv_pair, use_tbb); hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], self->chunk.chunk_counter + (subtree_chunks / 2)); @@ -568,6 +581,20 @@ void blake3_hasher_update(blake3_hasher *self, const void *input, } } +void blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len) { + bool use_tbb = false; + blake3_hasher_update_base(self, input, input_len, use_tbb); +} + +#if defined(BLAKE3_USE_TBB) +void blake3_hasher_update_tbb(blake3_hasher *self, const void *input, + size_t input_len) { + bool use_tbb = true; + blake3_hasher_update_base(self, input, input_len, use_tbb); +} +#endif // BLAKE3_USE_TBB + void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, size_t out_len) { blake3_hasher_finalize_seek(self, 0, out, out_len); diff --git a/src/third_party/blake3/blake3/blake3.h b/src/third_party/blake3/blake3/blake3.h index d917503e..08932954 100644 --- a/src/third_party/blake3/blake3/blake3.h +++ b/src/third_party/blake3/blake3/blake3.h @@ -30,7 +30,7 @@ extern "C" { #endif -#define BLAKE3_VERSION_STRING "1.6.1" +#define BLAKE3_VERSION_STRING "1.8.2" #define BLAKE3_KEY_LEN 32 #define BLAKE3_OUT_LEN 32 #define BLAKE3_BLOCK_LEN 64 @@ -69,6 +69,10 @@ BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const voi size_t context_len); BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input, size_t input_len); +#if defined(BLAKE3_USE_TBB) +BLAKE3_API void blake3_hasher_update_tbb(blake3_hasher *self, const void *input, + size_t input_len); +#endif // BLAKE3_USE_TBB BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, size_t out_len); BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, diff --git a/src/third_party/blake3/blake3/blake3_impl.h b/src/third_party/blake3/blake3/blake3_impl.h index 51d792a8..facd5997 100644 --- a/src/third_party/blake3/blake3/blake3_impl.h +++ b/src/third_party/blake3/blake3/blake3_impl.h @@ -9,6 +9,10 @@ #include "blake3.h" +#ifdef __cplusplus +extern "C" { +#endif + // internal flags enum blake3_flags { CHUNK_START = 1 << 0, @@ -28,6 +32,12 @@ enum blake3_flags { #define INLINE static inline __attribute__((always_inline)) #endif +#ifdef __cplusplus +#define NOEXCEPT noexcept +#else +#define NOEXCEPT +#endif + #if (defined(__x86_64__) || defined(_M_X64)) && !defined(_M_ARM64EC) #define IS_X86 #define IS_X86_64 @@ -210,6 +220,22 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, size_t blake3_simd_degree(void); +BLAKE3_PRIVATE size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, + uint8_t *out, bool use_tbb); + +#if defined(BLAKE3_USE_TBB) +BLAKE3_PRIVATE void blake3_compress_subtree_wide_join_tbb( + // shared params + const uint32_t key[8], uint8_t flags, bool use_tbb, + // left-hand side params + const uint8_t *l_input, size_t l_input_len, uint64_t l_chunk_counter, + uint8_t *l_cvs, size_t *l_n, + // right-hand side params + const uint8_t *r_input, size_t r_input_len, uint64_t r_chunk_counter, + uint8_t *r_cvs, size_t *r_n) NOEXCEPT; +#endif // Declarations for implementation-specific functions. void blake3_compress_in_place_portable(uint32_t cv[8], @@ -300,5 +326,8 @@ void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, uint8_t flags_end, uint8_t *out); #endif +#ifdef __cplusplus +} +#endif #endif /* BLAKE3_IMPL_H */ -- 2.47.2