# testing a builtin takes a little more work
CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED)
CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED)
+CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P)
if (NOT WIN32)
set(C_FLAGS_TO_CHECK
/* Define to 1 if you have the `_aligned_malloc' function. */
#cmakedefine HAVE__ALIGNED_MALLOC
+/* Define if compiler has __builtin_constant_p */
+#cmakedefine HAVE__BUILTIN_CONSTANT_P
+
/* Optimize, inline critical functions */
#cmakedefine HS_OPTIMIZE
#endif
}
-#define lshift64_m128(a, b) _mm_slli_epi64((a), (b))
+static really_really_inline
+m128 lshift64_m128(m128 a, unsigned b) {
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+ if (__builtin_constant_p(b)) {
+ return _mm_slli_epi64(a, b);
+ }
+#endif
+ m128 x = _mm_cvtsi32_si128(b);
+ return _mm_sll_epi64(a, x);
+}
+
#define rshift64_m128(a, b) _mm_srli_epi64((a), (b))
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
****/
#if defined(HAVE_AVX2)
-#define lshift64_m256(a, b) _mm256_slli_epi64((a), (b))
+
+static really_really_inline
+m256 lshift64_m256(m256 a, unsigned b) {
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+ if (__builtin_constant_p(b)) {
+ return _mm256_slli_epi64(a, b);
+ }
+#endif
+ m128 x = _mm_cvtsi32_si128(b);
+ return _mm256_sll_epi64(a, x);
+}
+
#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))
static really_inline
#else
-static really_inline
+static really_really_inline
m256 lshift64_m256(m256 a, int b) {
m256 rv = a;
rv.lo = lshift64_m128(rv.lo, b);
return rv;
}
-// The shift amount is an immediate
static really_really_inline
m384 lshift64_m384(m384 a, unsigned b) {
m384 rv;
}
#if defined(HAVE_AVX512)
-#define lshift64_m512(a, b) _mm512_slli_epi64((a), b)
+static really_really_inline
+m512 lshift64_m512(m512 a, unsigned b) {
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+ if (__builtin_constant_p(b)) {
+ return _mm512_slli_epi64(a, b);
+ }
+#endif
+ m128 x = _mm_cvtsi32_si128(b);
+ return _mm512_sll_epi64(a, x);
+}
#else
-// The shift amount is an immediate
static really_really_inline
m512 lshift64_m512(m512 a, unsigned b) {
m512 rv;
void simd_loadbytes(m256 *a, const void *ptr, unsigned i) { *a = loadbytes256(ptr, i); }
void simd_loadbytes(m384 *a, const void *ptr, unsigned i) { *a = loadbytes384(ptr, i); }
void simd_loadbytes(m512 *a, const void *ptr, unsigned i) { *a = loadbytes512(ptr, i); }
+m128 simd_lshift64(const m128 &a, unsigned i) { return lshift64_m128(a, i); }
+m256 simd_lshift64(const m256 &a, unsigned i) { return lshift64_m256(a, i); }
+m384 simd_lshift64(const m384 &a, unsigned i) { return lshift64_m384(a, i); }
+m512 simd_lshift64(const m512 &a, unsigned i) { return lshift64_m512(a, i); }
template<typename T>
class SimdUtilsTest : public testing::Test {
}
}
+TYPED_TEST(SimdUtilsTest, lshift64) {
+ TypeParam a;
+ memset(&a, 0x5a, sizeof(a));
+
+ static constexpr u64a exp_val = 0x5a5a5a5a5a5a5a5aULL;
+
+ union {
+ TypeParam simd;
+ u64a qword[sizeof(TypeParam) / 8];
+ } c;
+ cout << "non-const for size " << sizeof(a) << '\n';
+ for (unsigned s = 0; s < 64; s++) {
+ c.simd = simd_lshift64(a, s);
+
+ const u64a expected = exp_val << s;
+ for (size_t i = 0; i < sizeof(c) / 8; i++) {
+ EXPECT_EQ(expected, c.qword[i]);
+ }
+ }
+
+ // test immediates
+ u64a expected;
+
+ cout << "imm for size " << sizeof(a) << '\n';
+ c.simd = simd_lshift64(a, 1);
+ expected = exp_val << 1;
+ for (size_t i = 0; i < sizeof(c) / 8; i++) {
+ EXPECT_EQ(expected, c.qword[i]);
+ }
+
+ c.simd = simd_lshift64(a, 2);
+ expected = exp_val << 2;
+ for (size_t i = 0; i < sizeof(c) / 8; i++) {
+ EXPECT_EQ(expected, c.qword[i]);
+ }
+
+ c.simd = simd_lshift64(a, 7);
+ expected = exp_val << 7;
+ for (size_t i = 0; i < sizeof(c) / 8; i++) {
+ EXPECT_EQ(expected, c.qword[i]);
+ }
+
+ c.simd = simd_lshift64(a, 31);
+ expected = exp_val << 31;
+ for (size_t i = 0; i < sizeof(c) / 8; i++) {
+ EXPECT_EQ(expected, c.qword[i]);
+ }
+}
+
+
TEST(SimdUtilsTest, alignment) {
ASSERT_EQ(16, alignof(m128));
ASSERT_EQ(32, alignof(m256));