}
#if defined(HAVE_AVX512VBMI)
-static really_inline m512 expand128(m128 a) {
+static really_inline m512 broadcast128(m128 a) {
return _mm512_broadcast_i32x4(a);
}
#endif
}
#if defined(HAVE_AVX512VBMI)
-static really_inline m512 expand256(m256 a) {
+static really_inline m512 broadcast256(m256 a) {
return _mm512_broadcast_i64x4(a);
}
#endif
return _mm256_loadu_si256((const m256 *)ptr);
}
-static really_inline
+static really_really_inline
m256 loadu_maskz_m256(__mmask32 k, const void *ptr) {
return _mm256_maskz_loadu_epi8(k, ptr);
}
#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed)
#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2)
#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4)
-#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a))
+#define extractlow64from256(a) movq(cast256to128(a))
#define extractlow32from256(a) movd(cast256to128(a))
#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b)
#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b)
static really_inline u64a movq512(const m512 in) {
// NOTE: seems AVX512 doesn't support _mm512_cvtsi512_si64(in),
// so we use 2-step conversions to work around.
- return _mm_cvtsi128_si64(_mm512_castsi512_si128(in));
+ return movq(_mm512_castsi512_si128(in));
}
static really_inline
}
static really_inline
-m512 set16x32(u32 a) {
+m512 set1_16x32(u32 a) {
return _mm512_set1_epi32(a);
}
static really_inline
m512 swap256in512(m512 a) {
- m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL);
+ m512 idx = set8x64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL);
return vpermq512(idx, a);
}
static really_inline m512
add512(m512 a, m512 b) {
- return _mm512_add_epu64(a, b);
+ return _mm512_add_epi64(a, b);
}
static really_inline
}
#if defined(HAVE_AVX512VBMI)
-static really_inline m512 expand384(m384 a) {
+static really_inline m512 broadcast384(m384 a) {
u64a *lo = (u64a*)&a.lo;
u64a *mid = (u64a*)&a.mid;
u64a *hi = (u64a*)&a.hi;
#define or_m512(a, b) (or512(a, b))
#if defined(HAVE_AVX512VBMI)
-#define expand_m128(a) (expand128(a))
-#define expand_m256(a) (expand256(a))
-#define expand_m384(a) (expand384(a))
-#define expand_m512(a) (a)
+#define broadcast_m128(a) (broadcast128(a))
+#define broadcast_m256(a) (broadcast256(a))
+#define broadcast_m384(a) (broadcast384(a))
+#define broadcast_m512(a) (a)
#define shuffle_byte_m128(a, b) (pshufb_m512(b, a))
#define shuffle_byte_m256(a, b) (vpermb512(a, b))