From: Joel Rosdahl Date: Sat, 6 Nov 2021 07:54:46 +0000 (+0100) Subject: bump: Upgrade to BLAKE3 1.2.0 X-Git-Tag: v4.5~14 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=10bfc8264bc7b3e161edc25d47a6e967fa85d349;p=thirdparty%2Fccache.git bump: Upgrade to BLAKE3 1.2.0 --- diff --git a/LICENSE.adoc b/LICENSE.adoc index 2bf092a8f..431697aa8 100644 --- a/LICENSE.adoc +++ b/LICENSE.adoc @@ -72,7 +72,7 @@ along with this program. If not, see . === src/third_party/blake3/blake3_* -This is a subset of https://github.com/BLAKE3-team/BLAKE3[BLAKE3] 1.0.0 with +This is a subset of https://github.com/BLAKE3-team/BLAKE3[BLAKE3] 1.2.0 with the following license: ---- diff --git a/src/third_party/blake3/blake3.c b/src/third_party/blake3/blake3.c index 9998f75c7..20b6bd04c 100644 --- a/src/third_party/blake3/blake3.c +++ b/src/third_party/blake3/blake3.c @@ -340,12 +340,18 @@ INLINE void compress_subtree_to_parent_node( uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, chunk_counter, flags, cv_array); + assert(num_cvs <= MAX_SIMD_DEGREE_OR_2); // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, // compress_subtree_wide() returns more than 2 chaining values. Condense // them into 2 by forming parent nodes repeatedly. uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; - while (num_cvs > 2) { + // The second half of this loop condition is always true, and we just + // asserted it above. But GCC can't tell that it's always true, and if NDEBUG + // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious + // warnings here. GCC 8.5 is particularly sensitive, so if you're changing + // this code, test it against that version. + while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) { num_cvs = compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); diff --git a/src/third_party/blake3/blake3.h b/src/third_party/blake3/blake3.h index 5eacf1801..c3ca22da3 100644 --- a/src/third_party/blake3/blake3.h +++ b/src/third_party/blake3/blake3.h @@ -8,7 +8,7 @@ extern "C" { #endif -#define BLAKE3_VERSION_STRING "1.0.0" +#define BLAKE3_VERSION_STRING "1.2.0" #define BLAKE3_KEY_LEN 32 #define BLAKE3_OUT_LEN 32 #define BLAKE3_BLOCK_LEN 64 diff --git a/src/third_party/blake3/blake3_dispatch.c b/src/third_party/blake3/blake3_dispatch.c index 6518478e5..b49805897 100644 --- a/src/third_party/blake3/blake3_dispatch.c +++ b/src/third_party/blake3/blake3_dispatch.c @@ -232,7 +232,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, #endif #endif -#if defined(BLAKE3_USE_NEON) +#if BLAKE3_USE_NEON == 1 blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); return; @@ -269,7 +269,7 @@ size_t blake3_simd_degree(void) { } #endif #endif -#if defined(BLAKE3_USE_NEON) +#if BLAKE3_USE_NEON == 1 return 4; #endif return 1; diff --git a/src/third_party/blake3/blake3_impl.h b/src/third_party/blake3/blake3_impl.h index 86ab6aa25..ba2e91c64 100644 --- a/src/third_party/blake3/blake3_impl.h +++ b/src/third_party/blake3/blake3_impl.h @@ -38,6 +38,10 @@ enum blake3_flags { #define IS_X86_32 #endif +#if defined(__aarch64__) || defined(_M_ARM64) +#define IS_AARCH64 +#endif + #if defined(IS_X86) #if defined(_MSC_VER) #include @@ -45,9 +49,18 @@ enum blake3_flags { #include #endif +#if !defined(BLAKE3_USE_NEON) + // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness + #if defined(IS_AARCH64) + #define BLAKE3_USE_NEON 1 + #else + #define BLAKE3_USE_NEON 0 + #endif +#endif + #if defined(IS_X86) #define MAX_SIMD_DEGREE 16 -#elif defined(BLAKE3_USE_NEON) +#elif BLAKE3_USE_NEON == 1 #define MAX_SIMD_DEGREE 4 #else #define MAX_SIMD_DEGREE 1 @@ -257,7 +270,7 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, #endif #endif -#if defined(BLAKE3_USE_NEON) +#if BLAKE3_USE_NEON == 1 void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, diff --git a/src/third_party/blake3/blake3_neon.c b/src/third_party/blake3/blake3_neon.c index 46691f526..a6f6da921 100644 --- a/src/third_party/blake3/blake3_neon.c +++ b/src/third_party/blake3/blake3_neon.c @@ -2,7 +2,12 @@ #include -// TODO: This is probably incorrect for big-endian ARM. How should that work? +#ifdef __ARM_BIG_ENDIAN +#error "This implementation only supports little-endian ARM." +// It might be that all we need for big-endian support here is to get the loads +// and stores right, but step zero would be finding a way to test it in CI. +#endif + INLINE uint32x4_t loadu_128(const uint8_t src[16]) { // vld1q_u32 has alignment requirements. Don't use it. uint32x4_t x; diff --git a/src/third_party/blake3/blake3_sse2_x86-64_windows_gnu.S b/src/third_party/blake3/blake3_sse2_x86-64_windows_gnu.S index 424b4f85e..8852ba597 100644 --- a/src/third_party/blake3/blake3_sse2_x86-64_windows_gnu.S +++ b/src/third_party/blake3/blake3_sse2_x86-64_windows_gnu.S @@ -2137,10 +2137,10 @@ _blake3_compress_in_place_sse2: por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - movdqa xmm10, xmm6 + movdqa xmm14, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] - pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] - por xmm8, xmm10 + pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm14 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 @@ -2268,10 +2268,10 @@ blake3_compress_xof_sse2: por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - movdqa xmm10, xmm6 + movdqa xmm14, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] - pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] - por xmm8, xmm10 + pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm14 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 diff --git a/src/third_party/blake3/blake3_sse2_x86-64_windows_msvc.asm b/src/third_party/blake3/blake3_sse2_x86-64_windows_msvc.asm index ff9bb4d9c..507502f11 100644 --- a/src/third_party/blake3/blake3_sse2_x86-64_windows_msvc.asm +++ b/src/third_party/blake3/blake3_sse2_x86-64_windows_msvc.asm @@ -2139,10 +2139,10 @@ _blake3_compress_in_place_sse2 PROC por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - movdqa xmm10, xmm6 + movdqa xmm14, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] - pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK] - por xmm8, xmm10 + pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm8, xmm14 pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 @@ -2271,10 +2271,10 @@ _blake3_compress_xof_sse2 PROC por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - movdqa xmm10, xmm6 + movdqa xmm14, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] - pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK] - por xmm8, xmm10 + pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm8, xmm14 pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5