// Given some input larger than one chunk, return the number of bytes that
// should go in the left subtree. This is the largest power-of-2 number of
// chunks that leaves at least 1 byte for the right subtree.
-INLINE size_t left_len(size_t content_len) {
- // Subtract 1 to reserve at least one byte for the right side. content_len
+INLINE size_t left_subtree_len(size_t input_len) {
+ // Subtract 1 to reserve at least one byte for the right side. input_len
// should always be greater than BLAKE3_CHUNK_LEN.
- size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
+ size_t full_chunks = (input_len - 1) / BLAKE3_CHUNK_LEN;
return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
}
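
To make the split rule concrete, here is a small self-contained sketch (illustrative only; example_left_subtree_len and CHUNK_LEN are stand-ins for the real left_subtree_len and BLAKE3_CHUNK_LEN):

#include <assert.h>
#include <stddef.h>

#define CHUNK_LEN 1024 /* stand-in for BLAKE3_CHUNK_LEN */

// Same rule as left_subtree_len(), restated for the example: the largest
// power-of-2 number of full chunks that still leaves at least 1 byte on the
// right.
static size_t example_left_subtree_len(size_t input_len) {
  size_t full_chunks = (input_len - 1) / CHUNK_LEN;
  size_t pow2 = 1;
  while (pow2 * 2 <= full_chunks) {
    pow2 *= 2;
  }
  return pow2 * CHUNK_LEN;
}

int main(void) {
  // 5 chunks + 1 byte: the left subtree takes 4 chunks, the right keeps
  // 1 chunk + 1 byte.
  assert(example_left_subtree_len(5 * CHUNK_LEN + 1) == 4 * CHUNK_LEN);
  // Exactly 2 chunks: left takes 1 chunk, right keeps the other full chunk.
  assert(example_left_subtree_len(2 * CHUNK_LEN) == CHUNK_LEN);
  return 0;
}
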
// Why not just have the caller split the input on the first update(), instead
// of implementing this special rule? Because we don't want to limit SIMD or
// multi-threading parallelism for that update().
-static size_t blake3_compress_subtree_wide(const uint8_t *input,
- size_t input_len,
- const uint32_t key[8],
- uint64_t chunk_counter,
- uint8_t flags, uint8_t *out) {
+size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len,
+ const uint32_t key[8],
+ uint64_t chunk_counter, uint8_t flags,
+ uint8_t *out, bool use_tbb) {
// Note that the single chunk case does *not* bump the SIMD degree up to 2
// when it is 1. If this implementation adds multi-threading in the future,
// this gives us the option of multi-threading even the 2-chunk case, which
// can help performance on smaller platforms.
// With more than simd_degree chunks, we need to recurse. Start by dividing
// the input into left and right subtrees. (Note that this is only optimal
// as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
// of 3 or something, we'll need a more complicated strategy.)
- size_t left_input_len = left_len(input_len);
+ size_t left_input_len = left_subtree_len(input_len);
size_t right_input_len = input_len - left_input_len;
const uint8_t *right_input = &input[left_input_len];
uint64_t right_chunk_counter =
    chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN);
}
uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
- // Recurse! If this implementation adds multi-threading support in the
- // future, this is where it will go.
- size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key,
- chunk_counter, flags, cv_array);
- size_t right_n = blake3_compress_subtree_wide(
- right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
+ // Recurse!
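+ // Both values are overwritten on every path below; -1 (i.e. SIZE_MAX) is
+ // just an out-of-range placeholder.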
+ size_t left_n = -1;
+ size_t right_n = -1;
+
+#if defined(BLAKE3_USE_TBB)
+ blake3_compress_subtree_wide_join_tbb(
+ key, flags, use_tbb,
+ // left-hand side
+ input, left_input_len, chunk_counter, cv_array, &left_n,
+ // right-hand side
+ right_input, right_input_len, right_chunk_counter, right_cvs, &right_n);
+#else
+ left_n = blake3_compress_subtree_wide(
+ input, left_input_len, key, chunk_counter, flags, cv_array, use_tbb);
+ right_n = blake3_compress_subtree_wide(right_input, right_input_len, key,
+ right_chunk_counter, flags, right_cvs,
+ use_tbb);
+#endif // BLAKE3_USE_TBB
// The special case again. If simd_degree=1, then we'll have left_n=1 and
// right_n=1. Rather than compressing them into a single output, return
// them directly, to make sure we always have at least two outputs.
//
// As with compress_subtree_wide(), this function is not used on inputs of 1
// chunk or less. That's a different codepath.
-INLINE void compress_subtree_to_parent_node(
- const uint8_t *input, size_t input_len, const uint32_t key[8],
- uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) {
+INLINE void
+compress_subtree_to_parent_node(const uint8_t *input, size_t input_len,
+ const uint32_t key[8], uint64_t chunk_counter,
+ uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN],
+ bool use_tbb) {
#if defined(BLAKE3_TESTING)
assert(input_len > BLAKE3_CHUNK_LEN);
#endif
uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
- chunk_counter, flags, cv_array);
+ chunk_counter, flags, cv_array, use_tbb);
assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
// The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because
// as we just asserted, num_cvs will always be <=2 in that case. But GCC
// doesn't know that, and if NDEBUG is set then it emits incorrect warnings
// here.
self->cv_stack_len += 1;
}
-void blake3_hasher_update(blake3_hasher *self, const void *input,
- size_t input_len) {
+INLINE void blake3_hasher_update_base(blake3_hasher *self, const void *input,
+ size_t input_len, bool use_tbb) {
// Explicitly checking for zero avoids causing UB by passing a null pointer
// to memcpy. This comes up in practice with things like:
// std::vector<uint8_t> v;
// blake3_hasher_update(&hasher, v.data(), v.size());
uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
compress_subtree_to_parent_node(input_bytes, subtree_len, self->key,
self->chunk.chunk_counter,
- self->chunk.flags, cv_pair);
+ self->chunk.flags, cv_pair, use_tbb);
hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
self->chunk.chunk_counter + (subtree_chunks / 2));
}
}
+void blake3_hasher_update(blake3_hasher *self, const void *input,
+ size_t input_len) {
+ bool use_tbb = false;
+ blake3_hasher_update_base(self, input, input_len, use_tbb);
+}
+
+#if defined(BLAKE3_USE_TBB)
+void blake3_hasher_update_tbb(blake3_hasher *self, const void *input,
+ size_t input_len) {
+ bool use_tbb = true;
+ blake3_hasher_update_base(self, input, input_len, use_tbb);
+}
+#endif // BLAKE3_USE_TBB
+
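
As a usage sketch (not part of the patch): assuming the library was built with BLAKE3_USE_TBB defined and linked against oneTBB, and that blake3.h declares blake3_hasher_update_tbb under the same guard, a caller can opt into the multi-threaded path like this:

#include <stddef.h>
#include <stdint.h>

#include "blake3.h"

// Hash a buffer, preferring the multi-threaded update path when it was
// compiled in; otherwise fall back to the ordinary single-threaded update.
static void hash_buffer(const void *data, size_t len,
                        uint8_t out[BLAKE3_OUT_LEN]) {
  blake3_hasher hasher;
  blake3_hasher_init(&hasher);
#if defined(BLAKE3_USE_TBB)
  blake3_hasher_update_tbb(&hasher, data, len);
#else
  blake3_hasher_update(&hasher, data, len);
#endif
  blake3_hasher_finalize(&hasher, out, BLAKE3_OUT_LEN);
}
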
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
size_t out_len) {
blake3_hasher_finalize_seek(self, 0, out, out_len);
#include "blake3.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
// internal flags
enum blake3_flags {
CHUNK_START = 1 << 0,
#define INLINE static inline __attribute__((always_inline))
#endif
+#ifdef __cplusplus
+#define NOEXCEPT noexcept
+#else
+#define NOEXCEPT
+#endif
+
#if (defined(__x86_64__) || defined(_M_X64)) && !defined(_M_ARM64EC)
#define IS_X86
#define IS_X86_64
size_t blake3_simd_degree(void);
+BLAKE3_PRIVATE size_t blake3_compress_subtree_wide(
+    const uint8_t *input, size_t input_len, const uint32_t key[8],
+    uint64_t chunk_counter, uint8_t flags, uint8_t *out, bool use_tbb);
+
+#if defined(BLAKE3_USE_TBB)
+BLAKE3_PRIVATE void blake3_compress_subtree_wide_join_tbb(
+ // shared params
+ const uint32_t key[8], uint8_t flags, bool use_tbb,
+ // left-hand side params
+ const uint8_t *l_input, size_t l_input_len, uint64_t l_chunk_counter,
+ uint8_t *l_cvs, size_t *l_n,
+ // right-hand side params
+ const uint8_t *r_input, size_t r_input_len, uint64_t r_chunk_counter,
+ uint8_t *r_cvs, size_t *r_n) NOEXCEPT;
+#endif
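
For context, here is a sketch of how a C++ translation unit could provide this join helper with oneTBB's parallel_invoke; the extern "C" block and the NOEXCEPT macro added above exist so that such a C++ file can include this header. This illustrates the approach under those assumptions, not necessarily the exact contents of the real blake3_tbb.cpp.

#include <oneapi/tbb/parallel_invoke.h>

#include "blake3_impl.h"

void blake3_compress_subtree_wide_join_tbb(
    // shared params
    const uint32_t key[8], uint8_t flags, bool use_tbb,
    // left-hand side params
    const uint8_t *l_input, size_t l_input_len, uint64_t l_chunk_counter,
    uint8_t *l_cvs, size_t *l_n,
    // right-hand side params
    const uint8_t *r_input, size_t r_input_len, uint64_t r_chunk_counter,
    uint8_t *r_cvs, size_t *r_n) noexcept {
  if (!use_tbb) {
    // Serial fallback: same order as the #else branch in blake3.c.
    *l_n = blake3_compress_subtree_wide(l_input, l_input_len, key,
                                        l_chunk_counter, flags, l_cvs, use_tbb);
    *r_n = blake3_compress_subtree_wide(r_input, r_input_len, key,
                                        r_chunk_counter, flags, r_cvs, use_tbb);
    return;
  }
  // Hash the two halves as parallel tasks. Each half recurses back into
  // blake3_compress_subtree_wide(), which calls this helper again, so the
  // whole subtree is spread across the TBB task scheduler.
  oneapi::tbb::parallel_invoke(
      [&]() {
        *l_n = blake3_compress_subtree_wide(
            l_input, l_input_len, key, l_chunk_counter, flags, l_cvs, use_tbb);
      },
      [&]() {
        *r_n = blake3_compress_subtree_wide(
            r_input, r_input_len, key, r_chunk_counter, flags, r_cvs, use_tbb);
      });
}

Because parallel_invoke blocks until both lambdas return, left_n and right_n are guaranteed to be written before the caller in blake3_compress_subtree_wide reads them.
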
// Declarations for implementation-specific functions.
void blake3_compress_in_place_portable(uint32_t cv[8],
uint8_t flags_end, uint8_t *out);
#endif
+#ifdef __cplusplus
+}
+#endif
#endif /* BLAKE3_IMPL_H */