const m256 low4bits) {
// do the hi and lo shuffles in the one avx register
m256 c = set2x128(chars);
- c = _mm256_srlv_epi64(c, _mm256_set_epi64x(0, 0, 4, 4));
+ c = _mm256_srlv_epi64(c, _mm256_set_epi64x(4, 4, 0, 0));
c = and256(c, low4bits);
m256 c_shuf = vpshufb(mask, c);
m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf));
const m256 low4bits) {
// do the hi and lo shuffles in the one avx register
m256 c = set2x128(chars);
- c = _mm256_srlv_epi64(c, _mm256_set_epi64x(0, 0, 4, 4));
+ c = _mm256_srlv_epi64(c, _mm256_set_epi64x(4, 4, 0, 0));
c = and256(c, low4bits);
m256 c_shuf = vpshufb(mask, c);
m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf));
const m256 low4bits) {
// do the hi and lo shuffles in the one avx register
m256 c = set2x128(chars);
- c = _mm256_srlv_epi64(c, _mm256_set_epi64x(0, 0, 4, 4));
+ c = _mm256_srlv_epi64(c, _mm256_set_epi64x(4, 4, 0, 0));
c = and256(c, low4bits);
m256 c_shuf1 = vpshufb(mask1, c);
m256 c_shuf2 = rshift128_m256(vpshufb(mask2, c), 1);
neg_mask &= 0xffff;
array<u8, 32> nib_mask;
array<u8, 16> bucket_select_mask_16;
- copy(hi_mask.begin(), hi_mask.begin() + 16, nib_mask.begin());
- copy(lo_mask.begin(), lo_mask.begin() + 16, nib_mask.begin() + 16);
+ copy(lo_mask.begin(), lo_mask.begin() + 16, nib_mask.begin());
+ copy(hi_mask.begin(), hi_mask.begin() + 16, nib_mask.begin() + 16);
copy(bucket_select_lo.begin(), bucket_select_lo.begin() + 16,
bucket_select_mask_16.begin());
auto ri = make_unique<RoseInstrCheckShufti16x8>
}
static really_inline
m256 combine2x128(m128 hi, m128 lo) {
    // Concatenate two 128-bit vectors into one 256-bit vector.
    //
    // The m256 struct stores its low 128-bit lane first, so the brace
    // initializer must list `lo` before `hi`. This matches the intrinsic
    // form used elsewhere in this file: _mm256_set_m128i(hi, lo) /
    // insert128to256(cast128to256(lo), hi, 1), both of which put `hi`
    // in the upper lane and `lo` in the lower lane.
    m256 rv = {lo, hi};
    return rv;
}
#if defined(_mm256_set_m128i)
return _mm256_set_m128i(hi, lo);
#else
- return insert128to256(cast128to256(hi), lo, 1);
+ return insert128to256(cast128to256(lo), hi, 1);
#endif
}
#endif //AVX2