}
INLINE uint32x4_t rot16_128(uint32x4_t x) {
- return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
+ // The straightforward implementation would be two shifts and an or, but that's
+ // slower on microarchitectures we've tested. See
+ // https://github.com/BLAKE3-team/BLAKE3/pull/319.
+ // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
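+ // vrev32q_u16 swaps the two 16-bit halves of each 32-bit lane, which is
+ // exactly a rotation by 16, in a single instruction.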
+ return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
}
INLINE uint32x4_t rot12_128(uint32x4_t x) {
- return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
+ // See comment in rot16_128.
+ // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
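+ // vsri shifts x right by 12 and inserts it into the low 20 bits of the
+ // pre-shifted value, whose top 12 bits are preserved: a rotate right by 12
+ // in two instructions instead of three.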
+ return vsriq_n_u32(vshlq_n_u32(x, 32 - 12), x, 12);
}
INLINE uint32x4_t rot8_128(uint32x4_t x) {
- return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
+ // See comment in rot16_128.
+ // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
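+ // Rotating each 32-bit lane right by 8 is a pure byte permutation: on
+ // little-endian ARM, lane bytes [b0,b1,b2,b3] become [b1,b2,b3,b0], hence
+ // the index table below. Clang's __builtin_shufflevector and GCC's
+ // __builtin_shuffle (GCC 4.7+) both express that permutation directly;
+ // other compilers fall back to the shift-and-insert sequence.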
+#if defined(__clang__)
+ return vreinterpretq_u32_u8(__builtin_shufflevector(
+     vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x),
+     1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12));
+#elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >= 40700
+ static const uint8x16_t r8 = {1, 2, 3, 0, 5, 6, 7, 4,
+                               9, 10, 11, 8, 13, 14, 15, 12};
+ return vreinterpretq_u32_u8(
+     __builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8));
+#else
+ return vsriq_n_u32(vshlq_n_u32(x, 32 - 8), x, 8);
+#endif
}
INLINE uint32x4_t rot7_128(uint32x4_t x) {
- return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
+ // See comment in rot16_128.
+ // return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
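+ // 7 is not a multiple of 8 or 16, so there is no rev/shuffle shortcut here;
+ // use the same shift-and-insert trick as rot12_128.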
+ return vsriq_n_u32(vshlq_n_u32(x, 32 - 7), x, 7);
}
// TODO: compress_neon