From: Joel Rosdahl Date: Fri, 7 Jul 2023 06:53:31 +0000 (+0200) Subject: bump: Upgrade to BLAKE3 1.4.1 X-Git-Tag: v4.9~152 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=7fa0e72340698dd62553be86034e2bfe66f88a04;p=thirdparty%2Fccache.git bump: Upgrade to BLAKE3 1.4.1 --- diff --git a/LICENSE.adoc b/LICENSE.adoc index ac41a0719..ad274ed79 100644 --- a/LICENSE.adoc +++ b/LICENSE.adoc @@ -72,7 +72,7 @@ along with this program. If not, see . === src/third_party/blake3/blake3_* -This is a subset of https://github.com/BLAKE3-team/BLAKE3[BLAKE3] 1.4.0 with the +This is a subset of https://github.com/BLAKE3-team/BLAKE3[BLAKE3] 1.4.1 with the following license: ---- diff --git a/src/third_party/blake3/blake3.c b/src/third_party/blake3/blake3.c index dc343f91c..692f4b021 100644 --- a/src/third_party/blake3/blake3.c +++ b/src/third_party/blake3/blake3.c @@ -254,7 +254,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, // As a special case when the SIMD degree is 1, this function will still return // at least 2 outputs. This guarantees that this function doesn't perform the // root compression. (If it did, it would use the wrong flags, and also we -// wouldn't be able to implement exendable output.) Note that this function is +// wouldn't be able to implement extendable output.) Note that this function is // not used when the whole input is only 1 chunk long; that's a different // codepath. 
// diff --git a/src/third_party/blake3/blake3.h b/src/third_party/blake3/blake3.h index aa4bfa6f3..21e0d7b9d 100644 --- a/src/third_party/blake3/blake3.h +++ b/src/third_party/blake3/blake3.h @@ -30,7 +30,7 @@ extern "C" { #endif -#define BLAKE3_VERSION_STRING "1.4.0" +#define BLAKE3_VERSION_STRING "1.4.1" #define BLAKE3_KEY_LEN 32 #define BLAKE3_OUT_LEN 32 #define BLAKE3_BLOCK_LEN 64 diff --git a/src/third_party/blake3/blake3_neon.c b/src/third_party/blake3/blake3_neon.c index a6f6da921..8a818fc78 100644 --- a/src/third_party/blake3/blake3_neon.c +++ b/src/third_party/blake3/blake3_neon.c @@ -36,19 +36,36 @@ INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { } INLINE uint32x4_t rot16_128(uint32x4_t x) { - return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16)); + // The straightforward implementation would be two shifts and an or, but that's + // slower on microarchitectures we've tested. See + // https://github.com/BLAKE3-team/BLAKE3/pull/319. + // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16)); + return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x))); } INLINE uint32x4_t rot12_128(uint32x4_t x) { - return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12)); + // See comment in rot16_128. + // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12)); + return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12); } INLINE uint32x4_t rot8_128(uint32x4_t x) { - return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8)); + // See comment in rot16_128. 
+ // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8)); +#if defined(__clang__) + return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12)); +#elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700 + static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12}; + return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8)); +#else + return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8); +#endif } INLINE uint32x4_t rot7_128(uint32x4_t x) { - return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7)); + // See comment in rot16_128. + // return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7)); + return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7); } // TODO: compress_neon