=== src/third_party/blake3/blake3_*
-This is a subset of https://github.com/BLAKE3-team/BLAKE3[BLAKE3] 1.0.0 with
+This is a subset of https://github.com/BLAKE3-team/BLAKE3[BLAKE3] 1.2.0 with
the following license:
----
uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
chunk_counter, flags, cv_array);
+ assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
// If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
// compress_subtree_wide() returns more than 2 chaining values. Condense
// them into 2 by forming parent nodes repeatedly.
uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
- while (num_cvs > 2) {
+ // The second half of this loop condition is always true, and we just
+ // asserted it above. But GCC can't tell that it's always true, and if NDEBUG
+ // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
+ // warnings here. GCC 8.5 is particularly sensitive, so if you're changing
+ // this code, test it against that version.
+ while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
num_cvs =
compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
extern "C" {
#endif
-#define BLAKE3_VERSION_STRING "1.0.0"
+#define BLAKE3_VERSION_STRING "1.2.0"
#define BLAKE3_KEY_LEN 32
#define BLAKE3_OUT_LEN 32
#define BLAKE3_BLOCK_LEN 64
#endif
#endif
-#if defined(BLAKE3_USE_NEON)
+#if BLAKE3_USE_NEON == 1
blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end, out);
return;
}
#endif
#endif
-#if defined(BLAKE3_USE_NEON)
+#if BLAKE3_USE_NEON == 1
return 4;
#endif
return 1;
#define IS_X86_32
#endif
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define IS_AARCH64
+#endif
+
#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#include <immintrin.h>
#endif
+#if !defined(BLAKE3_USE_NEON)
+ // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
+ #if defined(IS_AARCH64)
+ #define BLAKE3_USE_NEON 1
+ #else
+ #define BLAKE3_USE_NEON 0
+ #endif
+#endif
+
#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
-#elif defined(BLAKE3_USE_NEON)
+#elif BLAKE3_USE_NEON == 1
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif
#endif
-#if defined(BLAKE3_USE_NEON)
+#if BLAKE3_USE_NEON == 1
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
#include <arm_neon.h>
-// TODO: This is probably incorrect for big-endian ARM. How should that work?
+#ifdef __ARM_BIG_ENDIAN
+#error "This implementation only supports little-endian ARM."
+// It might be that all we need for big-endian support here is to get the loads
+// and stores right, but step zero would be finding a way to test it in CI.
+#endif
+
INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
// vld1q_u32 has alignment requirements. Don't use it.
uint32x4_t x;
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmm6
+ movdqa xmm14, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- por xmm8, xmm10
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm14
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmm6
+ movdqa xmm14, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- por xmm8, xmm10
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm14
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmm6
+ movdqa xmm14, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
- por xmm8, xmm10
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
+ por xmm8, xmm14
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmm6
+ movdqa xmm14, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
- por xmm8, xmm10
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
+ por xmm8, xmm14
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5