sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS);
#define PREPARE_MASKS_1 \
- dup_mask[0] = set4x128(maskBase[0]); \
- dup_mask[1] = set4x128(maskBase[1]);
+ dup_mask[0] = set1_4x128(maskBase[0]); \
+ dup_mask[1] = set1_4x128(maskBase[1]);
#define PREPARE_MASKS_2 \
PREPARE_MASKS_1 \
- dup_mask[2] = set4x128(maskBase[2]); \
- dup_mask[3] = set4x128(maskBase[3]);
+ dup_mask[2] = set1_4x128(maskBase[2]); \
+ dup_mask[3] = set1_4x128(maskBase[3]);
#define PREPARE_MASKS_3 \
PREPARE_MASKS_2 \
- dup_mask[4] = set4x128(maskBase[4]); \
- dup_mask[5] = set4x128(maskBase[5]);
+ dup_mask[4] = set1_4x128(maskBase[4]); \
+ dup_mask[5] = set1_4x128(maskBase[5]);
#define PREPARE_MASKS_4 \
PREPARE_MASKS_3 \
- dup_mask[6] = set4x128(maskBase[6]); \
- dup_mask[7] = set4x128(maskBase[7]);
+ dup_mask[6] = set1_4x128(maskBase[6]); \
+ dup_mask[7] = set1_4x128(maskBase[7]);
#define PREPARE_MASKS(n) \
- m512 lo_mask = set64x8(0xf); \
+ m512 lo_mask = set1_64x8(0xf); \
m512 dup_mask[n * 2]; \
m512 sl_msk[n - 1]; \
PREPARE_MASKS_##n \
&c_0, &c_16, &c_32, &c_48)
#define PREPARE_MASKS_1 \
- dup_mask[0] = set4x128(maskBase[0]); \
- dup_mask[1] = set4x128(maskBase[1]);
+ dup_mask[0] = set1_4x128(maskBase[0]); \
+ dup_mask[1] = set1_4x128(maskBase[1]);
#define PREPARE_MASKS_2 \
PREPARE_MASKS_1 \
- dup_mask[2] = set4x128(maskBase[2]); \
- dup_mask[3] = set4x128(maskBase[3]);
+ dup_mask[2] = set1_4x128(maskBase[2]); \
+ dup_mask[3] = set1_4x128(maskBase[3]);
#define PREPARE_MASKS_3 \
PREPARE_MASKS_2 \
- dup_mask[4] = set4x128(maskBase[4]); \
- dup_mask[5] = set4x128(maskBase[5]);
+ dup_mask[4] = set1_4x128(maskBase[4]); \
+ dup_mask[5] = set1_4x128(maskBase[5]);
#define PREPARE_MASKS_4 \
PREPARE_MASKS_3 \
- dup_mask[6] = set4x128(maskBase[6]); \
- dup_mask[7] = set4x128(maskBase[7]);
+ dup_mask[6] = set1_4x128(maskBase[6]); \
+ dup_mask[7] = set1_4x128(maskBase[7]);
#define PREPARE_MASKS(n) \
- m512 lo_mask = set64x8(0xf); \
+ m512 lo_mask = set1_64x8(0xf); \
m512 dup_mask[n * 2]; \
PREPARE_MASKS_##n
#define PREP_SHUF_MASK \
PREP_SHUF_MASK_NO_REINFORCEMENT(load256(ptr)); \
*c_128 = *(ptr + 15); \
- m256 r_msk = set64x4(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \
+ m256 r_msk = set4x64(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \
*c_0 = *(ptr + 31)
#define SHIFT_OR_M1 \
prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128)
#define PREPARE_MASKS_1 \
- dup_mask[0] = set2x128(maskBase[0]); \
- dup_mask[1] = set2x128(maskBase[1]);
+ dup_mask[0] = set1_2x128(maskBase[0]); \
+ dup_mask[1] = set1_2x128(maskBase[1]);
#define PREPARE_MASKS_2 \
PREPARE_MASKS_1 \
- dup_mask[2] = set2x128(maskBase[2]); \
- dup_mask[3] = set2x128(maskBase[3]);
+ dup_mask[2] = set1_2x128(maskBase[2]); \
+ dup_mask[3] = set1_2x128(maskBase[3]);
#define PREPARE_MASKS_3 \
PREPARE_MASKS_2 \
- dup_mask[4] = set2x128(maskBase[4]); \
- dup_mask[5] = set2x128(maskBase[5]);
+ dup_mask[4] = set1_2x128(maskBase[4]); \
+ dup_mask[5] = set1_2x128(maskBase[5]);
#define PREPARE_MASKS_4 \
PREPARE_MASKS_3 \
- dup_mask[6] = set2x128(maskBase[6]); \
- dup_mask[7] = set2x128(maskBase[7]);
+ dup_mask[6] = set1_2x128(maskBase[6]); \
+ dup_mask[7] = set1_2x128(maskBase[7]);
#define PREPARE_MASKS(n) \
- m256 lo_mask = set32x8(0xf); \
+ m256 lo_mask = set1_32x8(0xf); \
m256 dup_mask[n * 2]; \
PREPARE_MASKS_##n
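For reference, the fat-teddy PREPARE_MASKS(2) above now expands (writing the expansion out by hand from the macro bodies shown) to roughly:

    m256 lo_mask = set1_32x8(0xf);
    m256 dup_mask[4];
    dup_mask[0] = set1_2x128(maskBase[0]);
    dup_mask[1] = set1_2x128(maskBase[1]);
    dup_mask[2] = set1_2x128(maskBase[2]);
    dup_mask[3] = set1_2x128(maskBase[3]);

i.e. each 128-bit nibble table in maskBase is broadcast into both halves of a 256-bit register once, before the scan loop runs.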
static really_inline
m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) {
- m128 mask = set16x8(0xf);
+ m128 mask = set1_16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask);
return or128(pshufb_m128(maskBase[0 * 2], lo),
static really_inline
m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) {
- m128 mask = set16x8(0xf);
+ m128 mask = set1_16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m1(maskBase, val);
static really_inline
m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
m128 val) {
- m128 mask = set16x8(0xf);
+ m128 mask = set1_16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m2(maskBase, old_1, val);
static really_inline
m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
m128 *old_3, m128 val) {
- m128 mask = set16x8(0xf);
+ m128 mask = set1_16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val);
const u8 *buf_history, size_t len_history,
const u32 nMasks) {
m128 p_mask128;
- m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
+ m256 ret = set1_2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
buf_history, len_history, nMasks));
- *p_mask = set2x128(p_mask128);
+ *p_mask = set1_2x128(p_mask128);
return ret;
}
static really_inline
m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
- m256 mask = set32x8(0xf);
+ m256 mask = set1_32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
return or256(pshufb_m256(maskBase[0 * 2], lo),
static really_inline
m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
- m256 mask = set32x8(0xf);
+ m256 mask = set1_32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m1(maskBase, val);
static really_inline
m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
m256 val) {
- m256 mask = set32x8(0xf);
+ m256 mask = set1_32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);
static really_inline
m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
m256 *old_3, m256 val) {
- m256 mask = set32x8(0xf);
+ m256 mask = set1_32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);
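All of the prep_conf_teddy_m* / prep_conf_fat_teddy_m* helpers renamed in these hunks share the same low/high nibble split followed by two table lookups. A minimal stand-alone sketch of that step with raw SSE intrinsics (illustrative only; nibble_shuffle, shuf_lo and shuf_hi are hypothetical names standing in for maskBase[0] and maskBase[1]):

    #include <immintrin.h>

    /* Same lo/hi nibble split and PSHUFB lookup that prep_conf_teddy_m1()
     * expresses through the m128 wrappers (set1_16x8, and128, rshift64_m128,
     * pshufb_m128, or128). Requires SSSE3. */
    static inline __m128i nibble_shuffle(__m128i shuf_lo, __m128i shuf_hi,
                                         __m128i val) {
        __m128i mask = _mm_set1_epi8(0xf);
        __m128i lo = _mm_and_si128(val, mask);                    /* low nibble of each byte  */
        __m128i hi = _mm_and_si128(_mm_srli_epi64(val, 4), mask); /* high nibble of each byte */
        return _mm_or_si128(_mm_shuffle_epi8(shuf_lo, lo),
                            _mm_shuffle_epi8(shuf_hi, hi));
    }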
static really_inline m256 getMask(u8 c, bool noCase) {
u8 k = caseClear8(c, noCase);
- return set32x8(k);
+ return set1_32x8(k);
}
static really_inline m256 getCaseMask(void) {
- return set32x8(0xdf);
+ return set1_32x8(0xdf);
}
static really_inline
static really_inline m128 getMask(u8 c, bool noCase) {
u8 k = caseClear8(c, noCase);
- return set16x8(k);
+ return set1_16x8(k);
}
static really_inline m128 getCaseMask(void) {
- return set16x8(0xdf);
+ return set1_16x8(0xdf);
}
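The broadcast setters used above are thin wrappers over the set1-style intrinsics that this same patch replaces elsewhere (_mm_set1_epi8, _mm256_set1_epi8, _mm_set1_epi64x). A minimal sketch of what they look like on an SSE/AVX2 build; the typedefs are assumptions and the real definitions live in simd_utils.h:

    #include <immintrin.h>
    #include <stdint.h>

    typedef __m128i m128;   /* assumed: with AVX2, m256 is the native 256-bit type */
    typedef __m256i m256;

    static inline m128 set1_16x8(uint8_t c)  { return _mm_set1_epi8((char)c); }        /* 16 x u8 lanes */
    static inline m256 set1_32x8(uint8_t c)  { return _mm256_set1_epi8((char)c); }     /* 32 x u8 lanes */
    static inline m128 set1_2x64(uint64_t c) { return _mm_set1_epi64x((long long)c); } /* 2 x u64 lanes */

The old names encoded only the lane layout; the new set1_ prefix makes the broadcast (one value repeated across all lanes) explicit.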
static really_inline
if (len) {
m128 ss_char = load128(sherman_state);
- m128 cur_char = set16x8(cprime);
+ m128 cur_char = set1_16x8(cprime);
u32 z = movemask128(eq128(ss_char, cur_char));
if (len) {
m128 ss_char = load128(sherman_state);
- m128 cur_char = set16x8(cprime);
+ m128 cur_char = set1_16x8(cprime);
u32 z = movemask128(eq128(ss_char, cur_char));
assert(s_in); /* should not already be dead */
assert(soft_c_end <= hard_c_end);
DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1);
- m128 s = set16x8(s_in - 1);
+ m128 s = set1_16x8(s_in - 1);
const u8 *c = *c_inout;
const u8 *c_end = hard_c_end - SHENG_CHUNK + 1;
if (!do_accel) {
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
u32 sheng_stop_limit_x4 = sheng_stop_limit * 0x01010101;
- m128 simd_stop_limit = set4x32(sheng_stop_limit_x4);
- m128 accel_delta = set16x8(sheng_limit - sheng_stop_limit);
+ m128 simd_stop_limit = set1_4x32(sheng_stop_limit_x4);
+ m128 accel_delta = set1_16x8(sheng_limit - sheng_stop_limit);
DEBUG_PRINTF("end %hhu, accel %hu --> limit %hhu\n", sheng_limit,
m->sheng_accel_limit, sheng_stop_limit);
#endif
}
DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start));
- m128 cur_state = set16x8(*state);
+ m128 cur_state = set1_16x8(*state);
const m128 *masks = s->shuffle_masks;
while (likely(cur_buf != end)) {
return MO_CONTINUE_MATCHING;
}
- m128 cur_state = set16x8(*state);
+ m128 cur_state = set1_16x8(*state);
const m128 *masks = s->shuffle_masks;
while (likely(end - cur_buf >= 4)) {
}
const m128 zeroes = zeroes128();
- const m128 low4bits = _mm_set1_epi8(0xf);
+ const m128 low4bits = set1_16x8(0xf);
const u8 *rv;
size_t min = (size_t)buf % 16;
}
const m128 zeroes = zeroes128();
- const m128 low4bits = _mm_set1_epi8(0xf);
+ const m128 low4bits = set1_16x8(0xf);
const u8 *rv;
assert(buf_end - buf >= 16);
m128 mask2_lo, m128 mask2_hi,
const u8 *buf, const u8 *buf_end) {
const m128 ones = ones128();
- const m128 low4bits = _mm_set1_epi8(0xf);
+ const m128 low4bits = set1_16x8(0xf);
const u8 *rv;
size_t min = (size_t)buf % 16;
buf, buf_end);
}
- const m256 low4bits = set32x8(0xf);
+ const m256 low4bits = set1_32x8(0xf);
if (buf_end - buf <= 32) {
return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits);
}
const m256 zeroes = zeroes256();
- const m256 wide_mask_lo = set2x128(mask_lo);
- const m256 wide_mask_hi = set2x128(mask_hi);
+ const m256 wide_mask_lo = set1_2x128(mask_lo);
+ const m256 wide_mask_hi = set1_2x128(mask_hi);
const u8 *rv;
size_t min = (size_t)buf % 32;
buf, buf_end);
}
- const m256 low4bits = set32x8(0xf);
+ const m256 low4bits = set1_32x8(0xf);
if (buf_end - buf <= 32) {
return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits);
}
const m256 zeroes = zeroes256();
- const m256 wide_mask_lo = set2x128(mask_lo);
- const m256 wide_mask_hi = set2x128(mask_hi);
+ const m256 wide_mask_lo = set1_2x128(mask_lo);
+ const m256 wide_mask_hi = set1_2x128(mask_hi);
const u8 *rv;
assert(buf_end - buf >= 32);
const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo,
m128 mask2_hi, const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);
- const m256 low4bits = set32x8(0xf);
+ const m256 low4bits = set1_32x8(0xf);
// run shufti over two overlapping 16-byte unaligned reads
const m256 mask1 = combine2x128(mask1_hi, mask1_lo);
const m256 mask2 = combine2x128(mask2_hi, mask2_lo);
}
const m256 ones = ones256();
- const m256 low4bits = set32x8(0xf);
- const m256 wide_mask1_lo = set2x128(mask1_lo);
- const m256 wide_mask1_hi = set2x128(mask1_hi);
- const m256 wide_mask2_lo = set2x128(mask2_lo);
- const m256 wide_mask2_hi = set2x128(mask2_hi);
+ const m256 low4bits = set1_32x8(0xf);
+ const m256 wide_mask1_lo = set1_2x128(mask1_lo);
+ const m256 wide_mask1_hi = set1_2x128(mask1_hi);
+ const m256 wide_mask2_lo = set1_2x128(mask2_lo);
+ const m256 wide_mask2_hi = set1_2x128(mask2_hi);
const u8 *rv;
size_t min = (size_t)buf % 32;
static really_inline
u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) {
- m128 highconst = _mm_set1_epi8(0x80);
- m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201);
+ m128 highconst = set1_16x8(0x80);
+ m128 shuf_mask_hi = set1_2x64(0x8040201008040201);
// and now do the real work
m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v);
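An observation on the constant passed to set1_2x64 just above (not part of the patch): its bytes, lowest first, are 01 02 04 08 10 20 40 80, i.e. byte n holds 1 << n, so the truffle block's pshufb against shuf_mask_hi maps a masked high-nibble index n to the one-hot value 1 << (n & 7), which is then tested against the two low-nibble table lookups.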
static really_inline
u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) {
- m256 highconst = _mm256_set1_epi8(0x80);
- m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201);
+ m256 highconst = set1_32x8(0x80);
+ m256 shuf_mask_hi = set1_4x64(0x8040201008040201);
// and now do the real work
m256 shuf1 = pshufb_m256(shuf_mask_lo_highclear, v);
m128 shuf_mask_lo_highset,
const u8 *buf, const u8 *buf_end) {
DEBUG_PRINTF("len %zu\n", buf_end - buf);
- const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
- const m256 wide_set = set2x128(shuf_mask_lo_highset);
+ const m256 wide_clear = set1_2x128(shuf_mask_lo_highclear);
+ const m256 wide_set = set1_2x128(shuf_mask_lo_highset);
assert(buf && buf_end);
assert(buf < buf_end);
const u8 *rtruffleExec(m128 shuf_mask_lo_highclear,
m128 shuf_mask_lo_highset,
const u8 *buf, const u8 *buf_end) {
- const m256 wide_clear = set2x128(shuf_mask_lo_highclear);
- const m256 wide_set = set2x128(shuf_mask_lo_highset);
+ const m256 wide_clear = set1_2x128(shuf_mask_lo_highclear);
+ const m256 wide_set = set1_2x128(shuf_mask_lo_highset);
assert(buf && buf_end);
assert(buf < buf_end);
const u8 *rv;
#define VERM_BOUNDARY 16
#define VERM_TYPE m128
-#define VERM_SET_FN set16x8
+#define VERM_SET_FN set1_16x8
static really_inline
const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end,
const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf,
const u8 *buf_end, char negate) {
assert((size_t)buf % 16 == 0);
- m128 casemask = set16x8(CASE_CLEAR);
+ m128 casemask = set1_16x8(CASE_CLEAR);
for (; buf + 31 < buf_end; buf += 32) {
m128 data = load128(buf);
// returns NULL if not found
static really_inline
const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) {
- m128 casemask = set16x8(CASE_CLEAR);
+ m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf); // unaligned
u32 z = movemask128(eq128(chars, and128(casemask, data)));
if (negate) {
const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
const u8 *buf, const u8 *buf_end) {
assert((size_t)buf % 16 == 0);
- m128 casemask = set16x8(CASE_CLEAR);
+ m128 casemask = set1_16x8(CASE_CLEAR);
for (; buf + 16 < buf_end; buf += 16) {
m128 data = load128(buf);
static really_inline
const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
/* due to laziness, nonalphas and nocase having interesting behaviour */
- m128 casemask = set16x8(CASE_CLEAR);
+ m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf); // unaligned
m128 v = and128(casemask, data);
u32 z = movemask128(and128(eq128(chars1, v),
const u8 *rvermSearchAlignedNocase(m128 chars, const u8 *buf,
const u8 *buf_end, char negate) {
assert((size_t)buf_end % 16 == 0);
- m128 casemask = set16x8(CASE_CLEAR);
+ m128 casemask = set1_16x8(CASE_CLEAR);
for (; buf + 15 < buf_end; buf_end -= 16) {
m128 data = load128(buf_end - 16);
// returns NULL if not found
static really_inline
const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) {
- m128 casemask = set16x8(CASE_CLEAR);
+ m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf); // unaligned
u32 z = movemask128(eq128(chars, and128(casemask, data)));
if (negate) {
const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
const u8 *buf, const u8 *buf_end) {
assert((size_t)buf_end % 16 == 0);
- m128 casemask = set16x8(CASE_CLEAR);
+ m128 casemask = set1_16x8(CASE_CLEAR);
for (; buf + 16 < buf_end; buf_end -= 16) {
m128 data = load128(buf_end - 16);
static really_inline
const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
/* due to laziness, nonalphas and nocase having interesting behaviour */
- m128 casemask = set16x8(CASE_CLEAR);
+ m128 casemask = set1_16x8(CASE_CLEAR);
m128 data = loadu128(buf);
m128 v = and128(casemask, data);
u32 z = movemask128(and128(eq128(chars2, v),
#define VERM_BOUNDARY 64
#define VERM_TYPE m512
-#define VERM_SET_FN set64x8
+#define VERM_SET_FN set1_64x8
static really_inline
const u8 *vermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) {
u32 count = *count_inout;
- m128 chars = set16x8(c);
+ m128 chars = set1_16x8(c);
for (; d + 16 <= d_end; d_end -= 16) {
m128 data = loadu128(d_end - 16);
u32 count = *count_inout;
const m128 zeroes = zeroes128();
- const m128 low4bits = _mm_set1_epi8(0xf);
+ const m128 low4bits = set1_16x8(0xf);
for (; d + 16 <= d_end; d_end -= 16) {
m128 data = loadu128(d_end - 16);
return 1;
}
- m256 data_m256 = set2x128(data);
+ m256 data_m256 = set1_2x128(data);
m256 hi_mask_m256 = loadu256(hi_mask);
m256 lo_mask_m256 = loadu256(lo_mask);
m256 bucket_select_mask_m256 = loadu256(bucket_select_mask);
m128 hi_mask_m128 = loadu128(hi_mask);
m128 lo_mask_m128 = loadu128(lo_mask);
- m256 hi_mask_m256 = set2x128(hi_mask_m128);
- m256 lo_mask_m256 = set2x128(lo_mask_m128);
+ m256 hi_mask_m256 = set1_2x128(hi_mask_m128);
+ m256 lo_mask_m256 = set1_2x128(lo_mask_m128);
m256 bucket_select_mask_m256 = loadu256(bucket_select_mask);
if (validateShuftiMask32x8(data, hi_mask_m256, lo_mask_m256,
bucket_select_mask_m256,
u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
- expand_valid = set64x2(valid_hi, valid_lo);
+ expand_valid = set2x64(valid_hi, valid_lo);
valid_path_mask = ~movemask128(pshufb_m128(expand_valid,
data_select_mask));
}
u32 valid_data_mask;
m128 data_m128 = getData128(ci, offset, &valid_data_mask);
- m256 data_double = set2x128(data_m128);
+ m256 data_double = set1_2x128(data_m128);
m256 data_select_mask = loadu256(ri->data_select_mask);
u32 valid_path_mask = 0;
u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
- expand_valid = set64x4(valid_hi, valid_lo, valid_hi,
+ expand_valid = set4x64(valid_hi, valid_lo, valid_hi,
valid_lo);
valid_path_mask = ~movemask256(pshufb_m256(expand_valid,
data_select_mask));
u32 valid_data_mask;
m128 data_m128 = getData128(ci, offset, &valid_data_mask);
- m256 data_double = set2x128(data_m128);
+ m256 data_double = set1_2x128(data_m128);
m256 data_select_mask = loadu256(ri->data_select_mask);
u32 valid_path_mask = 0;
u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
- expand_valid = set64x4(valid_hi, valid_lo, valid_hi,
+ expand_valid = set4x64(valid_hi, valid_lo, valid_hi,
valid_lo);
valid_path_mask = ~movemask256(pshufb_m256(expand_valid,
data_select_mask));
u32 valid_data_mask;
m128 data_m128 = getData128(ci, offset, &valid_data_mask);
- m256 data_m256 = set2x128(data_m128);
+ m256 data_m256 = set1_2x128(data_m128);
m256 data_select_mask_1 = loadu256(ri->data_select_mask);
m256 data_select_mask_2 = loadu256(ri->data_select_mask + 32);
u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
- expand_valid = set64x4(valid_hi, valid_lo, valid_hi,
+ expand_valid = set4x64(valid_hi, valid_lo, valid_hi,
valid_lo);
u32 valid_path_1 = movemask256(pshufb_m256(expand_valid,
data_select_mask_1));
int validateShuftiMask16x16(const m256 data, const m256 hi_mask,
const m256 lo_mask, const m256 and_mask,
const u32 neg_mask, const u32 valid_data_mask) {
- m256 low4bits = set32x8(0xf);
+ m256 low4bits = set1_32x8(0xf);
m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits));
m256 c_hi = pshufb_m256(hi_mask,
rshift64_m256(andnot256(low4bits, data), 4));
const m128 and_mask, const u32 neg_mask,
const u32 valid_data_mask) {
m256 data_m256 = combine2x128(rshift64_m128(data, 4), data);
- m256 low4bits = set32x8(0xf);
+ m256 low4bits = set1_32x8(0xf);
m256 c_nib = pshufb_m256(nib_mask, and256(data_m256, low4bits));
m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib));
m128 nresult = eq128(and128(t, and_mask), zeroes128());
int validateShuftiMask32x8(const m256 data, const m256 hi_mask,
const m256 lo_mask, const m256 and_mask,
const u32 neg_mask, const u32 valid_data_mask) {
- m256 low4bits = set32x8(0xf);
+ m256 low4bits = set1_32x8(0xf);
m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits));
m256 c_hi = pshufb_m256(hi_mask,
rshift64_m256(andnot256(low4bits, data), 4));
const m256 bucket_mask_hi,
const m256 bucket_mask_lo, const u32 neg_mask,
const u32 valid_data_mask) {
- m256 low4bits = set32x8(0xf);
+ m256 low4bits = set1_32x8(0xf);
m256 data_lo = and256(data, low4bits);
m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo);
const u32 neg_mask,
const u32 valid_path_mask) {
m256 data_256 = combine2x128(rshift64_m128(data, 4), data);
- m256 low4bits = set32x8(0xf);
+ m256 low4bits = set1_32x8(0xf);
m256 c_nib = pshufb_m256(nib_mask, and256(data_256, low4bits));
m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib));
m128 result = and128(t, bucket_select_mask);
const u32 hi_bits, const u32 lo_bits,
const u32 neg_mask,
const u32 valid_path_mask) {
- m256 low4bits = set32x8(0xf);
+ m256 low4bits = set1_32x8(0xf);
m256 data_lo = and256(data, low4bits);
m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
m256 c_lo = pshufb_m256(lo_mask, data_lo);
const u32 hi_bits, const u32 lo_bits,
const u32 neg_mask,
const u32 valid_path_mask) {
- m256 low4bits = set32x8(0xf);
+ m256 low4bits = set1_32x8(0xf);
m256 data_lo = and256(data, low4bits);
m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo);
const u64a hi_bits, const u64a lo_bits,
const u64a neg_mask,
const u64a valid_path_mask) {
- m256 low4bits = set32x8(0xf);
+ m256 low4bits = set1_32x8(0xf);
m256 c_lo_1 = pshufb_m256(lo_mask, and256(data_1, low4bits));
m256 c_lo_2 = pshufb_m256(lo_mask, and256(data_2, low4bits));
m256 c_hi_1 = pshufb_m256(hi_mask,
u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
expand32(v[2], m[2]), expand32(v[3], m[3]) };
- return _mm_set_epi32(x[3], x[2], x[1], x[0]);
+ return set4x32(x[3], x[2], x[1], x[0]);
}
#endif
static really_inline
m128 loadcompressed128_64bit(const void *ptr, m128 mvec) {
// First, decompose our vectors into 64-bit chunks.
- u64a m[2] = { movq(mvec), movq(_mm_srli_si128(mvec, 8)) };
+ u64a m[2] = { movq(mvec), movq(rshiftbyte_m128(mvec, 8)) };
u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) };
u64a v[2];
u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) };
- return _mm_set_epi64x(x[1], x[0]);
+ return set2x64(x[1], x[0]);
}
#endif
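The multi-lane setters keep the intrinsics' highest-lane-first argument order, as the unchanged argument lists in these hunks show. A minimal sketch of the two 64-bit variants on an SSE2/AVX build (assumed typedefs; the real definitions are in simd_utils.h):

    #include <immintrin.h>
    #include <stdint.h>

    typedef __m128i m128;
    typedef __m256i m256;
    typedef uint64_t u64a;

    /* highest lane first, mirroring _mm_set_epi64x / _mm256_set_epi64x */
    static inline m128 set2x64(u64a hi, u64a lo) {
        return _mm_set_epi64x((long long)hi, (long long)lo);
    }
    static inline m256 set4x64(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) {
        return _mm256_set_epi64x((long long)hi_1, (long long)hi_0,
                                 (long long)lo_1, (long long)lo_0);
    }

The 32-bit counterparts follow the same count-by-width pattern (set4x32 for _mm_set_epi32, and so on in the wider hunks below).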
expand32(v[6], m[6]), expand32(v[7], m[7]) };
#if !defined(HAVE_AVX2)
- m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]),
- .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) };
+ m256 xvec = { .lo = set4x32(x[3], x[2], x[1], x[0]),
+ .hi = set4x32(x[7], x[6], x[5], x[4]) };
#else
- m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4],
- x[3], x[2], x[1], x[0]);
+ m256 xvec = set8x32(x[7], x[6], x[5], x[4],
+ x[3], x[2], x[1], x[0]);
#endif
return xvec;
}
expand64(v[2], m[2]), expand64(v[3], m[3]) };
#if !defined(HAVE_AVX2)
- m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]),
- .hi = _mm_set_epi64x(x[3], x[2]) };
+ m256 xvec = { .lo = set2x64(x[1], x[0]),
+ .hi = set2x64(x[3], x[2]) };
#else
- m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]);
+ m256 xvec = set4x64(x[3], x[2], x[1], x[0]);
#endif
return xvec;
}
expand32(v[8], m[8]), expand32(v[9], m[9]),
expand32(v[10], m[10]), expand32(v[11], m[11]) };
- m384 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]),
- .mid = _mm_set_epi32(x[7], x[6], x[5], x[4]),
- .hi = _mm_set_epi32(x[11], x[10], x[9], x[8]) };
+ m384 xvec = { .lo = set4x32(x[3], x[2], x[1], x[0]),
+ .mid = set4x32(x[7], x[6], x[5], x[4]),
+ .hi = set4x32(x[11], x[10], x[9], x[8]) };
return xvec;
}
#endif
expand64(v[2], m[2]), expand64(v[3], m[3]),
expand64(v[4], m[4]), expand64(v[5], m[5]) };
- m384 xvec = { .lo = _mm_set_epi64x(x[1], x[0]),
- .mid = _mm_set_epi64x(x[3], x[2]),
- .hi = _mm_set_epi64x(x[5], x[4]) };
+ m384 xvec = { .lo = set2x64(x[1], x[0]),
+ .mid = set2x64(x[3], x[2]),
+ .hi = set2x64(x[5], x[4]) };
return xvec;
}
#endif
m512 xvec;
#if defined(HAVE_AVX512)
- xvec = _mm512_set_epi32(x[15], x[14], x[13], x[12],
- x[11], x[10], x[9], x[8],
- x[7], x[6], x[5], x[4],
- x[3], x[2], x[1], x[0]);
+ xvec = set16x32(x[15], x[14], x[13], x[12],
+ x[11], x[10], x[9], x[8],
+ x[7], x[6], x[5], x[4],
+ x[3], x[2], x[1], x[0]);
#elif defined(HAVE_AVX2)
- xvec.lo = _mm256_set_epi32(x[7], x[6], x[5], x[4],
- x[3], x[2], x[1], x[0]);
- xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12],
- x[11], x[10], x[9], x[8]);
+ xvec.lo = set8x32(x[7], x[6], x[5], x[4],
+ x[3], x[2], x[1], x[0]);
+ xvec.hi = set8x32(x[15], x[14], x[13], x[12],
+ x[11], x[10], x[9], x[8]);
#else
- xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]);
- xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]);
- xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]);
- xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]);
+ xvec.lo.lo = set4x32(x[3], x[2], x[1], x[0]);
+ xvec.lo.hi = set4x32(x[7], x[6], x[5], x[4]);
+ xvec.hi.lo = set4x32(x[11], x[10], x[9], x[8]);
+ xvec.hi.hi = set4x32(x[15], x[14], x[13], x[12]);
#endif
return xvec;
}
expand64(v[6], m[6]), expand64(v[7], m[7]) };
#if defined(HAVE_AVX512)
- m512 xvec = _mm512_set_epi64(x[7], x[6], x[5], x[4],
+ m512 xvec = set8x64(x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]);
#elif defined(HAVE_AVX2)
- m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]),
- .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])};
+ m512 xvec = { .lo = set4x64(x[3], x[2], x[1], x[0]),
+ .hi = set4x64(x[7], x[6], x[5], x[4])};
#else
- m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]),
- _mm_set_epi64x(x[3], x[2]) },
- .hi = { _mm_set_epi64x(x[5], x[4]),
- _mm_set_epi64x(x[7], x[6]) } };
+ m512 xvec = { .lo = { set2x64(x[1], x[0]),
+ set2x64(x[3], x[2]) },
+ .hi = { set2x64(x[5], x[4]),
+ set2x64(x[7], x[6]) } };
#endif
return xvec;
}