From: Joel Rosdahl Date: Sun, 20 Feb 2022 15:10:11 +0000 (+0100) Subject: bump: Upgrade to BLAKE3 1.3.1 X-Git-Tag: v4.6~11 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=6d1c3218a5e2c718d41353fb1dbb387ffc22024b;p=thirdparty%2Fccache.git bump: Upgrade to BLAKE3 1.3.1 --- diff --git a/LICENSE.adoc b/LICENSE.adoc index e1df2e8ff..efe3e6c76 100644 --- a/LICENSE.adoc +++ b/LICENSE.adoc @@ -72,7 +72,7 @@ along with this program. If not, see . === src/third_party/blake3/blake3_* -This is a subset of https://github.com/BLAKE3-team/BLAKE3[BLAKE3] 1.2.0 with +This is a subset of https://github.com/BLAKE3-team/BLAKE3[BLAKE3] 1.3.1 with the following license: ---- diff --git a/src/third_party/blake3/blake3.h b/src/third_party/blake3/blake3.h index 079f5657c..7caf9b4b5 100644 --- a/src/third_party/blake3/blake3.h +++ b/src/third_party/blake3/blake3.h @@ -8,7 +8,7 @@ extern "C" { #endif -#define BLAKE3_VERSION_STRING "1.3.0" +#define BLAKE3_VERSION_STRING "1.3.1" #define BLAKE3_KEY_LEN 32 #define BLAKE3_OUT_LEN 32 #define BLAKE3_BLOCK_LEN 64 diff --git a/src/third_party/blake3/blake3_avx2.c b/src/third_party/blake3/blake3_avx2.c index c5a2ce9e2..e76aa1a3a 100644 --- a/src/third_party/blake3/blake3_avx2.c +++ b/src/third_party/blake3/blake3_avx2.c @@ -208,7 +208,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs, out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]); out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]); for (size_t i = 0; i < 8; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); } transpose_vecs(&out[0]); transpose_vecs(&out[8]); @@ -219,14 +219,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter, const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter); const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); const __m256i add1 = _mm256_and_si256(mask, add0); - __m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1); + __m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1); __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)), _mm256_xor_si256( l, _mm256_set1_epi32(0x80000000))); - __m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry); + __m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry); *out_lo = l; *out_hi = h; } +static void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, diff --git a/src/third_party/blake3/blake3_avx512.c b/src/third_party/blake3/blake3_avx512.c index 77a5c385c..9c35b08c4 100644 --- a/src/third_party/blake3/blake3_avx512.c +++ b/src/third_party/blake3/blake3_avx512.c @@ -468,7 +468,7 @@ INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]); out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]); for (size_t i = 0; i < 4; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); } transpose_vecs_128(&out[0]); transpose_vecs_128(&out[4]); @@ -488,6 +488,7 @@ INLINE void load_counters4(uint64_t counter, bool increment_counter, *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32)); } +static void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, @@ -724,7 +725,7 @@ INLINE void transpose_msg_vecs8(const uint8_t *const *inputs, out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]); out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]); for (size_t i = 0; i < 8; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); } transpose_vecs_256(&out[0]); transpose_vecs_256(&out[8]); @@ -742,6 +743,7 @@ INLINE void load_counters8(uint64_t counter, bool increment_counter, *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32)); } +static void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, @@ -1037,7 +1039,7 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, out[14] = loadu_512(&inputs[14][block_offset]); out[15] = loadu_512(&inputs[15][block_offset]); for (size_t i = 0; i < 16; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); } transpose_vecs_512(out); } @@ -1047,13 +1049,14 @@ INLINE void load_counters16(uint64_t counter, bool increment_counter, const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter); const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); const __m512i add1 = _mm512_and_si512(mask, add0); - __m512i l = _mm512_add_epi32(_mm512_set1_epi32(counter), add1); + __m512i l = _mm512_add_epi32(_mm512_set1_epi32((int32_t)counter), add1); __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT); - __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32(counter >> 32), carry, _mm512_set1_epi32(counter >> 32), _mm512_set1_epi32(1)); + __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32((int32_t)(counter >> 32)), carry, _mm512_set1_epi32((int32_t)(counter >> 32)), _mm512_set1_epi32(1)); *out_lo = l; *out_hi = h; } +static void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, diff --git a/src/third_party/blake3/blake3_impl.h b/src/third_party/blake3/blake3_impl.h index ba2e91c64..cc5672f22 100644 --- a/src/third_party/blake3/blake3_impl.h +++ b/src/third_party/blake3/blake3_impl.h @@ -96,11 +96,11 @@ static unsigned int highest_one(uint64_t x) { #elif defined(_MSC_VER) && defined(IS_X86_32) if(x >> 32) { unsigned long index; - _BitScanReverse(&index, x >> 32); + _BitScanReverse(&index, (unsigned long)(x >> 32)); return 32 + index; } else { unsigned long index; - _BitScanReverse(&index, x); + _BitScanReverse(&index, (unsigned long)x); return index; } #else diff --git a/src/third_party/blake3/blake3_sse2.c b/src/third_party/blake3/blake3_sse2.c index 159296688..f4449ac0b 100644 --- a/src/third_party/blake3/blake3_sse2.c +++ b/src/third_party/blake3/blake3_sse2.c @@ -78,7 +78,7 @@ INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); } -INLINE __m128i blend_epi16(__m128i a, __m128i b, const int imm8) { +INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) { const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); __m128i mask = _mm_set1_epi16(imm8); mask = _mm_and_si128(mask, bits); @@ -435,7 +435,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs, out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); for (size_t i = 0; i < 4; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); } transpose_vecs(&out[0]); transpose_vecs(&out[4]); @@ -448,14 +448,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter, const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); const __m128i add1 = _mm_and_si128(mask, add0); - __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1); + __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1); __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); - __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry); + __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry); *out_lo = l; *out_hi = h; } +static void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, diff --git a/src/third_party/blake3/blake3_sse41.c b/src/third_party/blake3/blake3_sse41.c index b31122533..87a8dae15 100644 --- a/src/third_party/blake3/blake3_sse41.c +++ b/src/third_party/blake3/blake3_sse41.c @@ -429,7 +429,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs, out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); for (size_t i = 0; i < 4; ++i) { - _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); } transpose_vecs(&out[0]); transpose_vecs(&out[4]); @@ -442,14 +442,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter, const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); const __m128i add1 = _mm_and_si128(mask, add0); - __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1); + __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1); __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); - __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry); + __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry); *out_lo = l; *out_hi = h; } +static void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags,