From: Joel Rosdahl <joel@rosdahl.net>
Date: Fri, 7 Jul 2023 06:53:31 +0000 (+0200)
Subject: bump: Upgrade to BLAKE3 1.4.1
X-Git-Tag: v4.9~152
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=7fa0e72340698dd62553be86034e2bfe66f88a04;p=thirdparty%2Fccache.git

bump: Upgrade to BLAKE3 1.4.1
---

diff --git a/LICENSE.adoc b/LICENSE.adoc
index ac41a0719..ad274ed79 100644
--- a/LICENSE.adoc
+++ b/LICENSE.adoc
@@ -72,7 +72,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 === src/third_party/blake3/blake3_*
 
-This is a subset of https://github.com/BLAKE3-team/BLAKE3[BLAKE3] 1.4.0 with the
+This is a subset of https://github.com/BLAKE3-team/BLAKE3[BLAKE3] 1.4.1 with the
 following license:
 
 ----
diff --git a/src/third_party/blake3/blake3.c b/src/third_party/blake3/blake3.c
index dc343f91c..692f4b021 100644
--- a/src/third_party/blake3/blake3.c
+++ b/src/third_party/blake3/blake3.c
@@ -254,7 +254,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
 // As a special case when the SIMD degree is 1, this function will still return
 // at least 2 outputs. This guarantees that this function doesn't perform the
 // root compression. (If it did, it would use the wrong flags, and also we
-// wouldn't be able to implement exendable output.) Note that this function is
+// wouldn't be able to implement extendable output.) Note that this function is
 // not used when the whole input is only 1 chunk long; that's a different
 // codepath.
 //
diff --git a/src/third_party/blake3/blake3.h b/src/third_party/blake3/blake3.h
index aa4bfa6f3..21e0d7b9d 100644
--- a/src/third_party/blake3/blake3.h
+++ b/src/third_party/blake3/blake3.h
@@ -30,7 +30,7 @@
 extern "C" {
 #endif
 
-#define BLAKE3_VERSION_STRING "1.4.0"
+#define BLAKE3_VERSION_STRING "1.4.1"
 #define BLAKE3_KEY_LEN 32
 #define BLAKE3_OUT_LEN 32
 #define BLAKE3_BLOCK_LEN 64
diff --git a/src/third_party/blake3/blake3_neon.c b/src/third_party/blake3/blake3_neon.c
index a6f6da921..8a818fc78 100644
--- a/src/third_party/blake3/blake3_neon.c
+++ b/src/third_party/blake3/blake3_neon.c
@@ -36,19 +36,36 @@ INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
 }
 
 INLINE uint32x4_t rot16_128(uint32x4_t x) {
-  return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
+  // The straightforward implementation would be two shifts and an or, but
+  // that's slower on microarchitectures we've tested. See
+  // https://github.com/BLAKE3-team/BLAKE3/pull/319.
+  // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
+  return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
 }
 
 INLINE uint32x4_t rot12_128(uint32x4_t x) {
-  return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
+  // See comment in rot16_128.
+  // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
+  return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12);
 }
 
 INLINE uint32x4_t rot8_128(uint32x4_t x) {
-  return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
+  // See comment in rot16_128.
+  // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
+#if defined(__clang__)
+  return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12));
+#elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700
+  static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12};
+  return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8));
+#else 
+  return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8);
+#endif
 }
 
 INLINE uint32x4_t rot7_128(uint32x4_t x) {
-  return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
+  // See comment in rot16_128.
+  // return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
+  return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7);
 }
 
 // TODO: compress_neon