size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
chunk_counter, flags, cv_array);
assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
-
- // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
+ // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because
+ // as we just asserted, num_cvs will always be <=2 in that case. But GCC
+ // (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is
+ // set then it emits incorrect warnings here. We tried a few different
+ // hacks to silence these, but in the end our hacks just produced different
+ // warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of
+ // desperation, we ifdef out this entire loop when we know it's not needed.
+#if MAX_SIMD_DEGREE_OR_2 > 2
+ // If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input,
// compress_subtree_wide() returns more than 2 chaining values. Condense
// them into 2 by forming parent nodes repeatedly.
uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
- // The second half of this loop condition is always true, and we just
- // asserted it above. But GCC can't tell that it's always true, and if NDEBUG
- // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
- // warnings here. GCC 8.5 is particularly sensitive, so if you're changing
- // this code, test it against that version.
- while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
+ while (num_cvs > 2) {
num_cvs =
compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
}
+#endif
memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
}
// Load the 16 bytes at `src` into a uint32x4_t and return it, without going
// through vld1q_u32.
INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
// vld1q_u32 has alignment requirements. Don't use it.
// NOTE(review): presumably `src` is not guaranteed to meet those
// requirements, which is why the data is moved as plain bytes -- confirm
// against callers.
- uint32x4_t x;
- memcpy(&x, src, 16);
- return x;
+ return vreinterpretq_u32_u8(vld1q_u8(src));
}
// Write the 16 bytes of the vector `src` to `dest`, without going through
// vst1q_u32.
INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
// vst1q_u32 has alignment requirements. Don't use it.
// NOTE(review): presumably `dest` is not guaranteed to meet those
// requirements, which is why the data is moved as plain bytes -- confirm
// against callers.
- memcpy(dest, &src, 16);
+ vst1q_u8(dest, vreinterpretq_u8_u32(src));
}
INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {