}
INLINE uint32x4_t rot16_128(uint32x4_t x) {
- return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
+ // The straightforward implementation would be two shifts and an or, but that's
+ // slower on microarchitectures we've tested. See
+ // https://github.com/BLAKE3-team/BLAKE3/pull/319.
+ // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
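+ // vrev32q_u16 swaps the two 16-bit halves of each 32-bit lane, which is
+ // exactly a rotation by 16, in a single instruction.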
+ return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
}
INLINE uint32x4_t rot12_128(uint32x4_t x) {
- return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
+ // See comment in rot16_128.
+ // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
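+ // vsri shifts x right by 12 and inserts it into the low 20 bits of the
+ // pre-shifted value, whose top 12 bits are preserved: a rotate right by 12
+ // in two instructions instead of three.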
+ return vsriq_n_u32(vshlq_n_u32(x, 32 - 12), x, 12);
}
INLINE uint32x4_t rot8_128(uint32x4_t x) {
- return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
+ // See comment in rot16_128.
+ // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
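+ // Rotating each 32-bit lane right by 8 is a pure byte permutation: on
+ // little-endian ARM, lane bytes [b0,b1,b2,b3] become [b1,b2,b3,b0], hence
+ // the index table below. Clang's __builtin_shufflevector and GCC's
+ // __builtin_shuffle (GCC 4.7+) both express that permutation directly;
+ // other compilers fall back to the shift-and-insert sequence.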
+#if defined(__clang__)
+ return vreinterpretq_u32_u8(__builtin_shufflevector(
+     vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x),
+     1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12));
+#elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >= 40700
+ static const uint8x16_t r8 = {1, 2, 3, 0, 5, 6, 7, 4,
+                               9, 10, 11, 8, 13, 14, 15, 12};
+ return vreinterpretq_u32_u8(
+     __builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8));
+#else
+ return vsriq_n_u32(vshlq_n_u32(x, 32 - 8), x, 8);
+#endif
}
INLINE uint32x4_t rot7_128(uint32x4_t x) {
- return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
+ // See comment in rot16_128.
+ // return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
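+ // 7 is not a multiple of 8 or 16, so there is no rev/shuffle shortcut here;
+ // use the same shift-and-insert trick as rot12_128.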
+ return vsriq_n_u32(vshlq_n_u32(x, 32 - 7), x, 7);
}
// TODO: compress_neon