#if !defined(HAVE_SIMD_256_BITS)
/*
 * Left shift for the m256 type as used under !defined(HAVE_SIMD_256_BITS):
 * copies a, then applies lshift64_m128 with the same count b to the copy's
 * lo member and to its hi member.
 * The patch lines below change the count parameter from int to unsigned.
 * NOTE(review): no `return rv;` is visible before the closing brace in this
 * excerpt -- confirm against the full file.
 */
static really_really_inline
-m256 lshift64_m256(m256 a, int b) {
+m256 lshift64_m256(m256 a, unsigned b) {
m256 rv = a;
rv.lo = lshift64_m128(rv.lo, b);
rv.hi = lshift64_m128(rv.hi, b);
}
static really_inline
-m256 rshift64_m256(m256 a, int b) {
+m256 rshift64_m256(m256 a, unsigned b) {
m256 rv = a;
rv.lo = rshift64_m128(rv.lo, b);
rv.hi = rshift64_m128(rv.hi, b);
/*
 * Returns whatever isnonzero256 reports for the OR of a.lo and a.hi.
 * The removed lines first OR'd each half with itself before combining them;
 * assuming or256 is a plain bitwise OR, those self-ORs were no-ops, so the
 * added line combines the two halves directly.
 */
static really_inline
int isnonzero512(m512 a) {
- m256 x = or256(a.lo, a.lo);
- m256 y = or256(a.hi, a.hi);
- return isnonzero256(or256(x, y));
+ return isnonzero256(or256(a.lo, a.hi));
}
/**
// aligned load
static really_inline
m512 load512(const void *ptr) {
- assert(ISALIGNED_N(ptr, alignof(m256)));
+ assert(ISALIGNED_N(ptr, alignof(m512)));
// cppcheck-suppress cstyleCast
m512 rv = { load256(ptr), load256((const char *)ptr + 32) };
return rv;
return _mm_sll_epi64(a, x);
}
-#define rshift64_m128(a, b) _mm_srli_epi64((a), (b))
+static really_really_inline /* function form of rshift64_m128: shifts a by count b via the 64-bit shift intrinsics */
+m128 rshift64_m128(m128 a, unsigned b) {
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+ if (__builtin_constant_p(b)) { /* count is a compile-time constant: use the _mm_srli_epi64 form, as the replaced macro did */
+ return _mm_srli_epi64(a, b);
+ }
+#endif
+ m128 x = _mm_cvtsi32_si128(b); /* otherwise wrap the count in an m128 for _mm_srl_epi64 */
+ return _mm_srl_epi64(a, x); /* shift using the count held in x */
+}
+
/*
 * Thin wrappers: eq128 forwards to _mm_cmpeq_epi8, eq64_m128 forwards to
 * _mm_cmpeq_epi64, and movemask128 casts the result of _mm_movemask_epi8
 * to u32.
 */
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
#define eq64_m128(a, b) _mm_cmpeq_epi64((a), (b))
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
return _mm256_sll_epi64(a, x);
}
-#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))
+static really_really_inline /* function form of rshift64_m256: shifts a by count b via the 256-bit 64-bit-shift intrinsics */
+m256 rshift64_m256(m256 a, unsigned b) {
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+ if (__builtin_constant_p(b)) { /* count is a compile-time constant: use the _mm256_srli_epi64 form, as the replaced macro did */
+ return _mm256_srli_epi64(a, b);
+ }
+#endif
+ m128 x = _mm_cvtsi32_si128(b); /* the count operand is an m128 even though the shifted value is m256 */
+ return _mm256_srl_epi64(a, x); /* shift using the count held in x */
+}
static really_inline m256 set1_4x64(u64a c) {
return _mm256_set1_epi64x(c);
return _mm512_sll_epi64(a, x);
}
-#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b))
+static really_really_inline /* function form of rshift64_m512: shifts a by count b via the 512-bit 64-bit-shift intrinsics */
+m512 rshift64_m512(m512 a, unsigned b) {
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+ if (__builtin_constant_p(b)) { /* count is a compile-time constant: use the _mm512_srli_epi64 form, as the replaced macro did */
+ return _mm512_srli_epi64(a, b);
+ }
+#endif
+ m128 x = _mm_cvtsi32_si128(b); /* the count operand is an m128 even though the shifted value is m512 */
+ return _mm512_srl_epi64(a, x); /* shift using the count held in x */
+}
+
/*
 * Wrappers that forward directly to _mm512_bsrli_epi128 and
 * _mm512_bslli_epi128.
 * NOTE(review): the parameter name count_immed suggests the count must be a
 * compile-time immediate -- confirm against the intrinsic's documentation.
 */
#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed)
#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed)