hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
Z_TYPE z, size_t len, const struct cb_info *cbi) {
while (unlikely(z)) {
- Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z);
+ Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT;
size_t matchPos = d - buf + pos;
DEBUG_PRINTF("match pos %zu\n", matchPos);
hwlmcb_rv_t rv = final(n, buf, len, n->msk_len != 1, cbi, matchPos);
hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
Z_TYPE z, size_t len, const struct cb_info *cbi) {
while (unlikely(z)) {
- Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z);
+ Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT;
size_t matchPos = d - buf + pos - 1;
DEBUG_PRINTF("match pos %zu\n", matchPos);
hwlmcb_rv_t rv = final(n, buf, len, true, cbi, matchPos);
SuperVector<S> v = SuperVector<S>::Zeroes();
memcpy(&v.u, d, l);
- typename SuperVector<S>::movemask_type mask = SINGLE_LOAD_MASK(l);
+ typename SuperVector<S>::comparemask_type mask =
+ SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width());
v = v & caseMask;
- typename SuperVector<S>::movemask_type z = mask & mask1.eqmask(v);
+ typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v);
+ z = SuperVector<S>::iteration_mask(z);
return single_zscan(n, d, buf, z, len, cbi);
}
return HWLM_SUCCESS;
}
size_t buf_off = start - offset;
- typename SuperVector<S>::movemask_type mask = SINGLE_LOAD_MASK(l) << buf_off;
+ typename SuperVector<S>::comparemask_type mask =
+ SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width())
+ << (buf_off * SuperVector<S>::mask_width());
SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask;
- typename SuperVector<S>::movemask_type z = mask & mask1.eqmask(v);
+ typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v);
+ z = SuperVector<S>::iteration_mask(z);
return single_zscan(n, d, buf, z, len, cbi);
}
memcpy(&v.u, d, l);
v = v & caseMask;
- typename SuperVector<S>::movemask_type mask = DOUBLE_LOAD_MASK(l);
- typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v);
- typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v);
- typename SuperVector<S>::movemask_type z = mask & (z1 << 1) & z2;
+ typename SuperVector<S>::comparemask_type mask =
+ DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width());
+ typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v);
+ typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v);
+ typename SuperVector<S>::comparemask_type z =
+ mask & (z1 << (SuperVector<S>::mask_width())) & z2;
+ z = SuperVector<S>::iteration_mask(z);
return double_zscan(n, d, buf, z, len, cbi);
}
}
SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask;
size_t buf_off = start - offset;
- typename SuperVector<S>::movemask_type mask = DOUBLE_LOAD_MASK(l) << buf_off;
- typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v);
- typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v);
- typename SuperVector<S>::movemask_type z = mask & (z1 << 1) & z2;
+ typename SuperVector<S>::comparemask_type mask =
+ DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width())
+ << (buf_off * SuperVector<S>::mask_width());
+ typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v);
+ typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v);
+ typename SuperVector<S>::comparemask_type z =
+ mask & (z1 << SuperVector<S>::mask_width()) & z2;
+ z = SuperVector<S>::iteration_mask(z);
return double_zscan(n, d, buf, z, len, cbi);
}
__builtin_prefetch(base + 256);
SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
- typename SuperVector<S>::movemask_type z = mask1.eqmask(v);
+ typename SuperVector<S>::comparemask_type z = mask1.eqmask(v);
+ z = SuperVector<S>::iteration_mask(z);
hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi);
RETURN_IF_TERMINATED(rv);
size_t start = offset + n->msk_len - n->key_offset;
- typename SuperVector<S>::movemask_type lastz1{0};
+ typename SuperVector<S>::comparemask_type lastz1{0};
const u8 *d = buf + start;
const u8 *e = buf + end;
__builtin_prefetch(base + 256);
SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
- typename SuperVector<S>::movemask_type z1 = mask1.eqmask(v);
- typename SuperVector<S>::movemask_type z2 = mask2.eqmask(v);
- typename SuperVector<S>::movemask_type z = (z1 << 1 | lastz1) & z2;
- lastz1 = z1 >> Z_SHIFT;
+ typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v);
+ typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v);
+ typename SuperVector<S>::comparemask_type z =
+ (z1 << SuperVector<S>::mask_width() | lastz1) & z2;
+ lastz1 = z1 >> (Z_SHIFT * SuperVector<S>::mask_width());
+ z = SuperVector<S>::iteration_mask(z);
hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi);
RETURN_IF_TERMINATED(rv);
u32 packedExtract<16>(SuperVector<16> s, const SuperVector<16> permute, const SuperVector<16> compare) {
    // Gather the interesting state bytes into packed order, then drop any
    // lane that does not survive the compare mask.
    SuperVector<16> shuffled = s.pshufb<true>(permute);
    SuperVector<16> compared = shuffled & compare;
    // Invert the equality mask: a set bit marks a byte lane where the
    // shuffled value was altered by the compare mask.
    u64a rv = ~compared.eqmask(shuffled);
    if (SuperVector<16>::mask_width() != 1) {
        // Wide comparemask (e.g. 4 bits per byte on NEON): a 16-byte vector
        // occupies up to 64 mask bits, so the mask must NOT be truncated to
        // 0xffff before this compaction -- doing so would zero the
        // contribution of byte lanes 4..15. Compact one bit per
        // mask_width() group down to one bit per byte position.
        u32 ans = 0;
        for (u32 i = 0; i < 16; ++i) {
            ans |= (rv & (1ull << (i * SuperVector<16>::mask_width()))) >>
                   (i * SuperVector<16>::mask_width() - i);
        }
        return ans;
    }
    // Single-bit-per-byte mask: keep only the 16 byte lanes.
    return (u32)(rv & 0xffff);
}
u32 packedExtract<32>(SuperVector<32> s, const SuperVector<32> permute, const SuperVector<32> compare) {
    // Gather state bytes into packed order and knock out lanes that fail
    // the compare mask.
    SuperVector<32> shuffled = s.pshufb<true>(permute);
    SuperVector<32> compared = shuffled & compare;
    // TODO(danlark1): Future ARM support might have a bug.
    // NOTE(review): assumes one mask bit per byte lane (mask_width() == 1,
    // true on AVX2) -- confirm if a wide-mask 256-bit target appears.
    u64a inverted = (~compared.eqmask(shuffled)) & 0xffffffff;
    u64a lo = inverted & 0xffffU;
    u64a hi = inverted >> 16;
    // Fold the two 16-bit halves together into one 16-bit result.
    return (u32)(hi | lo);
}
u32 packedExtract<64>(SuperVector<64> s, const SuperVector<64> permute, const SuperVector<64> compare) {
    // Gather state bytes into packed order and knock out lanes that fail
    // the compare mask.
    SuperVector<64> shuffled = s.pshufb<true>(permute);
    SuperVector<64> compared = shuffled & compare;
    // TODO(danlark1): Future ARM support might have a bug.
    u64a inverted = ~compared.eqmask(shuffled);
    // OR-fold the four 16-bit quarters of the 64-bit mask into one 16-bit
    // result.
    inverted |= inverted >> 32;
    inverted |= inverted >> 16;
    return (u32)(inverted & 0xffffU);
}
uint32x4_t m = mask.u.u32x4[0];
uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
if (vmax != 0) {
- typename SuperVector<16>::movemask_type z = mask.movemask();
- DEBUG_PRINTF("z %08x\n", z);
- DEBUG_PRINTF("buf %p z %08x \n", buf, z);
- u32 pos = ctz32(z & 0xffff);
+ typename SuperVector<16>::comparemask_type z = mask.comparemask();
+ DEBUG_PRINTF("z %08llx\n", z);
+ DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+ u32 pos = ctz64(z) / SuperVector<16>::mask_width();
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 16);
- DEBUG_PRINTF("buf + pos %p\n", buf + pos);
+ DEBUG_PRINTF("buf + pos %p\n", buf + (pos));
return buf + pos;
} else {
return NULL; // no match
uint32x4_t m = mask.u.u32x4[0];
uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
if (vmax != 0) {
- typename SuperVector<16>::movemask_type z = mask.movemask();
- DEBUG_PRINTF("buf %p z %08x \n", buf, z);
- DEBUG_PRINTF("z %08x\n", z);
- u32 pos = clz32(z & 0xffff);
+ typename SuperVector<16>::comparemask_type z = mask.comparemask();
+ DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+ DEBUG_PRINTF("z %08llx\n", z);
+ u32 pos = clz64(z) / SuperVector<16>::mask_width();
DEBUG_PRINTF("match @ pos %u\n", pos);
- assert(pos >= 16 && pos < 32);
- return buf + (31 - pos);
+ return buf + (15 - pos);
} else {
return NULL; // no match
}
uint32x4_t m = mask.u.u32x4[0];
uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
if (vmax != 0) {
- typename SuperVector<16>::movemask_type z = mask.movemask();
- DEBUG_PRINTF("z %08x\n", z);
- DEBUG_PRINTF("buf %p z %08x \n", buf, z);
- u32 pos = ctz32(z & 0xffff);
+ typename SuperVector<16>::comparemask_type z = mask.comparemask();
+ DEBUG_PRINTF("z %08llx\n", z);
+ DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+ u32 pos = ctz64(z) / SuperVector<16>::mask_width();
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 16);
DEBUG_PRINTF("buf + pos %p\n", buf + pos);
uint32x4_t m = mask.u.u32x4[0];
uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
if (vmax != 0) {
- typename SuperVector<16>::movemask_type z = mask.movemask();
- DEBUG_PRINTF("buf %p z %08x \n", buf, z);
- DEBUG_PRINTF("z %08x\n", z);
- u32 pos = clz32(z & 0xffff);
+ typename SuperVector<16>::comparemask_type z = mask.comparemask();
+ DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+ DEBUG_PRINTF("z %08llx\n", z);
+ u32 pos = clz64(z) / SuperVector<16>::mask_width();
DEBUG_PRINTF("match @ pos %u\n", pos);
- assert(pos >= 16 && pos < 32);
- return buf + (31 - pos);
+ return buf + (15 - pos);
} else {
return NULL; // no match
}
/** \brief Return 1 if a and b are different otherwise 0 */
static really_inline int diff128(m128 a, m128 b) {
    // vceqq_s32 yields 0xFFFFFFFF per equal 32-bit lane; the narrowing
    // shift squeezes each 16-bit half-lane into a 4-bit nibble, so a fully
    // equal vector folds to an all-ones 64-bit scalar.
    uint64_t folded = vget_lane_u64(
        (uint64x1_t)vshrn_n_u16((uint16x8_t)vceqq_s32(a, b), 4), 0);
    return folded != ~0ull;
}
static really_inline int isnonzero128(m128 a) {
}
static really_inline u32 movemask128(m128 a) {
    // Per-byte weights: byte lane i of each 8-byte half contributes bit i.
    // NOTE(review): unlike a true SSE movemask this uses the whole byte,
    // not just its top bit -- assumes input bytes are 0x00 or 0xff
    // (comparison results); confirm callers only pass such masks.
    static const uint8x16_t powers = {1, 2, 4, 8, 16, 32, 64, 128,
                                      1, 2, 4, 8, 16, 32, 64, 128};

    // Horizontal pairwise widening adds collapse each 8-byte half into a
    // single mask byte (byte 0 and byte 8 of the vector).
    uint8x16_t collapsed = (uint8x16_t)vpaddlq_u32(
        vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers))));

    // Slide the high half's mask byte (lane 8) next to the low half's
    // (lane 0) so the two land in adjacent bytes.
    uint8x16_t shifted = vextq_u8(collapsed, (uint8x16_t)zeroes128(), 7);
    collapsed = vorrq_u8(collapsed, shifted);

    // Read the two adjacent bytes out as one little-endian u16.
    uint16_t out;
    vst1q_lane_u16(&out, (uint16x8_t)collapsed, 0);
    return out;
}
static really_inline m128 set1_16x8(u8 c) {
template <>
really_really_inline
const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
- SuperVector<16>::movemask_type z = v.movemask();
+ SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
if (unlikely(z)) {
template <>
really_really_inline
const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
- SuperVector<16>::movemask_type z = v.movemask();
+ SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
if (unlikely(z)) {
template <>
really_really_inline
const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
- SuperVector<16>::movemask_type z = v.movemask();
+ SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
if (unlikely(z != 0xffff)) {
template <>
really_really_inline
const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) {
- SuperVector<16>::movemask_type z = v.movemask();
+ SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
if (unlikely(z != 0xffff)) {
template <>
really_really_inline
const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
- SuperVector<16>::movemask_type z = v.movemask();
- DEBUG_PRINTF("buf %p z %08x \n", buf, z);
- DEBUG_PRINTF("z %08x\n", z);
+ assert(SuperVector<16>::mask_width() == 1);
+ SuperVector<16>::comparemask_type z = v.comparemask();
+ DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+ DEBUG_PRINTF("z %08llx\n", z);
if (unlikely(z)) {
u32 pos = ctz32(z);
- DEBUG_PRINTF("~z %08x\n", ~z);
+ DEBUG_PRINTF("~z %08llx\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 16);
return buf + pos;
template <>
really_really_inline
const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
- SuperVector<32>::movemask_type z = v.movemask();
- DEBUG_PRINTF("z 0x%08x\n", z);
+ assert(SuperVector<32>::mask_width() == 1);
+ SuperVector<32>::comparemask_type z = v.comparemask();
+ DEBUG_PRINTF("z 0x%08llx\n", z);
if (unlikely(z)) {
u32 pos = ctz32(z);
assert(pos < 32);
template <>
really_really_inline
const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
- SuperVector<64>::movemask_type z = v.movemask();
+ assert(SuperVector<64>::mask_width() == 1);
+ SuperVector<64>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("z 0x%016llx\n", z);
u64a mask = (~0ULL) >> (64 - len);
DEBUG_PRINTF("mask %016llx\n", mask);
template <>
really_really_inline
const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
- SuperVector<16>::movemask_type z = v.movemask();
- DEBUG_PRINTF("buf %p z %08x \n", buf, z);
- DEBUG_PRINTF("z %08x\n", z);
+ assert(SuperVector<16>::mask_width() == 1);
+ SuperVector<16>::comparemask_type z = v.comparemask();
+ DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+ DEBUG_PRINTF("z %08llx\n", z);
if (unlikely(z)) {
u32 pos = clz32(z);
DEBUG_PRINTF("match @ pos %u\n", pos);
template <>
really_really_inline
const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
- SuperVector<32>::movemask_type z = v.movemask();
- DEBUG_PRINTF("z 0x%08x\n", z);
+ assert(SuperVector<32>::mask_width() == 1);
+ SuperVector<32>::comparemask_type z = v.comparemask();
+ DEBUG_PRINTF("z 0x%08llx\n", z);
if (unlikely(z)) {
u32 pos = clz32(z);
assert(pos < 32);
template <>
really_really_inline
const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
- SuperVector<64>::movemask_type z = v.movemask();
+ assert(SuperVector<64>::mask_width() == 1);
+ SuperVector<64>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("z 0x%016llx\n", z);
u64a mask = (~0ULL) >> (64 - len);
DEBUG_PRINTF("mask %016llx\n", mask);
template <>
really_really_inline
const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
- SuperVector<16>::movemask_type z = v.movemask();
- DEBUG_PRINTF("buf %p z %08x \n", buf, z);
- DEBUG_PRINTF("z %08x\n", z);
+ assert(SuperVector<16>::mask_width() == 1);
+ SuperVector<16>::comparemask_type z = v.comparemask();
+ DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+ DEBUG_PRINTF("z %08llx\n", z);
if (unlikely(z != 0xffff)) {
u32 pos = ctz32(~z & 0xffff);
- DEBUG_PRINTF("~z %08x\n", ~z);
+ DEBUG_PRINTF("~z %08llx\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 16);
return buf + pos;
template <>
really_really_inline
const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
- SuperVector<32>::movemask_type z = v.movemask();
- DEBUG_PRINTF("z 0x%08x\n", z);
+ assert(SuperVector<32>::mask_width() == 1);
+ SuperVector<32>::comparemask_type z = v.comparemask();
+ DEBUG_PRINTF("z 0x%08llx\n", z);
if (unlikely(z != 0xffffffff)) {
- u32 pos = ctz32(~z);
+ u32 pos = ctz32(~z & 0xffffffffu);
assert(pos < 32);
DEBUG_PRINTF("match @ pos %u\n", pos);
return buf + pos;
template <>
really_really_inline
const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
- SuperVector<64>::movemask_type z = v.movemask();
+ assert(SuperVector<64>::mask_width() == 1);
+ SuperVector<64>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("z 0x%016llx\n", z);
u64a mask = (~0ULL) >> (64 - len);
DEBUG_PRINTF("mask %016llx\n", mask);
template <>
really_really_inline
const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) {
- SuperVector<16>::movemask_type z = v.movemask();
- DEBUG_PRINTF("buf %p z %08x \n", buf, z);
- DEBUG_PRINTF("z %08x\n", z);
+ assert(SuperVector<16>::mask_width() == 1);
+ SuperVector<16>::comparemask_type z = v.comparemask();
+ DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
+ DEBUG_PRINTF("z %08llx\n", z);
if (unlikely(z != 0xffff)) {
- u32 pos = clz32(~z & 0xffff);
- DEBUG_PRINTF("~z %08x\n", ~z);
+ u32 pos = clz32(~z & 0xffffu);
+ DEBUG_PRINTF("~z %08llx\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos >= 16 && pos < 32);
return buf + (31 - pos);
template<>
really_really_inline
const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, uint16_t UNUSED len) {
- SuperVector<32>::movemask_type z = v.movemask();
- if (unlikely(z != 0xffffffff)) {
- u32 pos = clz32(~z & 0xffffffff);
+ assert(SuperVector<32>::mask_width() == 1);
+ SuperVector<32>::comparemask_type z = v.comparemask();
+ if (unlikely(static_cast<u32>(z) != 0xffffffff)) {
+ u32 pos = clz32(~z & 0xffffffffu);
DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
assert(pos < 32);
return buf + (31 - pos);
template <>
really_really_inline
const u8 *last_zero_match_inverted<64>(const u8 *buf, SuperVector<64> v, uint16_t len) {
+ assert(SuperVector<64>::mask_width() == 1);
v.print8("v");
- SuperVector<64>::movemask_type z = v.movemask();
+ SuperVector<64>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("z 0x%016llx\n", z);
u64a mask = (~0ULL) >> (64 - len);
DEBUG_PRINTF("mask %016llx\n", mask);
}
template <>
really_inline typename SuperVector<16>::comparemask_type
SuperVector<16>::comparemask(void) const {
    // Narrowing right shift by 4 packs each 0x00/0xff byte lane into a
    // 4-bit nibble, producing a 64-bit mask with 4 identical bits per byte.
    // Precondition (see class doc): all input bytes are 0 or 0xff.
    return static_cast<typename SuperVector<16>::comparemask_type>(
        vget_lane_u64((uint64x1_t)vshrn_n_u16(u.u16x8[0], 4), 0));
}

template <>
really_inline typename SuperVector<16>::comparemask_type
SuperVector<16>::eqmask(SuperVector<16> const b) const {
    return eq(b).comparemask();
}

// Each byte lane contributes 4 mask bits on this target.
template <> really_inline u32 SuperVector<16>::mask_width() { return 4; }

template <>
really_inline typename SuperVector<16>::comparemask_type
SuperVector<16>::iteration_mask(
    typename SuperVector<16>::comparemask_type mask) {
    // Thin the mask to one bit per 4-bit group so ctz/clz-based iteration
    // visits each byte lane exactly once.
    return mask & 0x1111111111111111ull;
}
template <>
}
template <>
-really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const
-{
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::comparemask(void) const {
uint8x16_t s1 = vec_sr((uint8x16_t)u.v128[0], vec_splat_u8(7));
uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7));
}
template <>
-really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const
-{
- return eq(b).movemask();
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::eqmask(SuperVector<16> const b) const {
+ return eq(b).comparemask();
}
+template <> really_inline u32 SuperVector<16>::mask_width() { return 1; }
+
+template <>
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::iteration_mask(
+ typename SuperVector<16>::comparemask_type mask) {
+ return mask;
+}
template <>
template<uint8_t N>
}
template <>
-really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void)const
-{
- return _mm_movemask_epi8(u.v128[0]);
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::comparemask(void) const {
+ return (u32)_mm_movemask_epi8(u.v128[0]);
}
template <>
-really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const
-{
- return eq(b).movemask();
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::eqmask(SuperVector<16> const b) const {
+ return eq(b).comparemask();
+}
+
+template <> really_inline u32 SuperVector<16>::mask_width() { return 1; }
+
+template <>
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::iteration_mask(
+ typename SuperVector<16>::comparemask_type mask) {
+ return mask;
}
// template <>
}
template <>
-really_inline typename SuperVector<32>::movemask_type SuperVector<32>::movemask(void)const
-{
- return _mm256_movemask_epi8(u.v256[0]);
+really_inline typename SuperVector<32>::comparemask_type
+SuperVector<32>::comparemask(void) const {
+ return (u32)_mm256_movemask_epi8(u.v256[0]);
}
template <>
-really_inline typename SuperVector<32>::movemask_type SuperVector<32>::eqmask(SuperVector<32> const b) const
-{
- return eq(b).movemask();
+really_inline typename SuperVector<32>::comparemask_type
+SuperVector<32>::eqmask(SuperVector<32> const b) const {
+ return eq(b).comparemask();
}
+template <> really_inline u32 SuperVector<32>::mask_width() { return 1; }
+
+template <>
+really_inline typename SuperVector<32>::comparemask_type
+SuperVector<32>::iteration_mask(
+ typename SuperVector<32>::comparemask_type mask) {
+ return mask;
+}
// template <>
// template<uint8_t N>
template <>
really_inline SuperVector<64> SuperVector<64>::operator==(SuperVector<64> const &b) const
{
    // AVX512 byte compares produce a k-mask; expand it back into a
    // full 0x00/0xff byte vector for the generic SuperVector contract.
    const SuperVector<64>::comparemask_type k =
        _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]);
    return {_mm512_movm_epi8(k)};
}

template <>
really_inline SuperVector<64> SuperVector<64>::operator!=(SuperVector<64> const &b) const
{
    const SuperVector<64>::comparemask_type k =
        _mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]);
    return {_mm512_movm_epi8(k)};
}

template <>
really_inline SuperVector<64> SuperVector<64>::operator>(SuperVector<64> const &b) const
{
    const SuperVector<64>::comparemask_type k =
        _mm512_cmpgt_epi8_mask(u.v512[0], b.u.v512[0]);
    return {_mm512_movm_epi8(k)};
}

template <>
really_inline SuperVector<64> SuperVector<64>::operator<(SuperVector<64> const &b) const
{
    const SuperVector<64>::comparemask_type k =
        _mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]);
    return {_mm512_movm_epi8(k)};
}

template <>
really_inline SuperVector<64> SuperVector<64>::operator>=(SuperVector<64> const &b) const
{
    const SuperVector<64>::comparemask_type k =
        _mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]);
    return {_mm512_movm_epi8(k)};
}

template <>
really_inline SuperVector<64> SuperVector<64>::operator<=(SuperVector<64> const &b) const
{
    const SuperVector<64>::comparemask_type k =
        _mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]);
    return {_mm512_movm_epi8(k)};
}
}
template <>
-really_inline typename SuperVector<64>::movemask_type SuperVector<64>::movemask(void)const
-{
+really_inline typename SuperVector<64>::comparemask_type
+SuperVector<64>::comparemask(void) const {
__m512i msb = _mm512_set1_epi8(0xFF);
__m512i mask = _mm512_and_si512(msb, u.v512[0]);
return _mm512_cmpeq_epi8_mask(mask, msb);
}
template <>
-really_inline typename SuperVector<64>::movemask_type SuperVector<64>::eqmask(SuperVector<64> const b) const
-{
+really_inline typename SuperVector<64>::comparemask_type
+SuperVector<64>::eqmask(SuperVector<64> const b) const {
return _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]);
}
+template <> really_inline u32 SuperVector<64>::mask_width() { return 1; }
+
+template <>
+really_inline typename SuperVector<64>::comparemask_type
+SuperVector<64>::iteration_mask(
+ typename SuperVector<64>::comparemask_type mask) {
+ return mask;
+}
+
// template <>
// template<uint8_t N>
// really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const
using Z_TYPE = u64a;
#define Z_BITS 64
#define Z_SHIFT 63
+#define Z_POSSHIFT 0
#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS -(l)))
#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
#elif defined(HAVE_SIMD_256_BITS)
using Z_TYPE = u32;
#define Z_BITS 32
#define Z_SHIFT 31
+#define Z_POSSHIFT 0
#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
#elif defined(HAVE_SIMD_128_BITS)
+#if defined(ARCH_ARM32) || defined(ARCH_AARCH64)
+using Z_TYPE = u64a;
+#define Z_BITS 64
+#define Z_POSSHIFT 2
+#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS - (l)))
+#else
using Z_TYPE = u32;
#define Z_BITS 32
+#define Z_POSSHIFT 0
+#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
+#endif
#define Z_SHIFT 15
-#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
#endif
static constexpr bool is_valid = false;
static constexpr u16 size = 8;
using type = void;
- using movemask_type = void;
+ using comparemask_type = void;
+ using cmpmask_type = void;
static constexpr bool has_previous = false;
using previous_type = void;
static constexpr u16 previous_size = 4;
static constexpr bool is_valid = true;
static constexpr u16 size = 128;
using type = void;
- using movemask_type = u64a;
+ using comparemask_type = u64a;
static constexpr bool has_previous = true;
using previous_type = m512;
static constexpr u16 previous_size = 64;
static constexpr bool is_valid = true;
static constexpr u16 size = 64;
using type = m512;
- using movemask_type = u64a;
+ using comparemask_type = u64a;
static constexpr bool has_previous = true;
using previous_type = m256;
static constexpr u16 previous_size = 32;
static constexpr bool is_valid = true;
static constexpr u16 size = 32;
using type = m256;
- using movemask_type = u32;
+ using comparemask_type = u64a;
static constexpr bool has_previous = true;
using previous_type = m128;
static constexpr u16 previous_size = 16;
static constexpr bool is_valid = true;
static constexpr u16 size = 16;
using type = m128;
- using movemask_type = u32;
+ using comparemask_type = u64a;
static constexpr bool has_previous = false;
using previous_type = u64a;
static constexpr u16 previous_size = 8;
SuperVector eq(SuperVector const &b) const;
SuperVector operator<<(uint8_t const N) const;
SuperVector operator>>(uint8_t const N) const;
- typename base_type::movemask_type movemask(void) const;
- typename base_type::movemask_type eqmask(SuperVector const b) const;
+ // Returns mask_width groups of zeros or ones. To get the mask which can be
+ // iterated, use iteration_mask method, it ensures only one bit is set per
+ // mask_width group.
+ // Precondition: all bytes must be 0 or 0xff.
+ typename base_type::comparemask_type comparemask(void) const;
+ typename base_type::comparemask_type eqmask(SuperVector const b) const;
+ static u32 mask_width();
+ // Returns a mask with at most 1 bit set to 1. It can be used to iterate
+ // over bits through ctz/clz and lowest bit clear.
+ static typename base_type::comparemask_type
+ iteration_mask(typename base_type::comparemask_type mask);
static SuperVector loadu(void const *ptr);
static SuperVector load(void const *ptr);
}
}
auto SP = SuperVector<16>::loadu(vec);
- u16 mask = SP.movemask();
- for(int i=0; i<16; i++) {
- if (mask & (1 << i)) {
+ u64a mask = SP.comparemask();
+ for (int i = 0; i < 16; i++) {
+ if (mask & (1ull << (i * SuperVector<16>::mask_width()))) {
vec2[i] = 0xff;
}
}
for (int i = 0; i<16; i++) { vec2[i]= rand() % 100 + 67;}
auto SP = SuperVector<16>::loadu(vec);
auto SP1 = SuperVector<16>::loadu(vec2);
- int mask = SP.eqmask(SP);
- ASSERT_EQ(mask,0xFFFF);
+ u64a mask = SP.eqmask(SP);
+ for (u32 i = 0; i < 16; ++i) {
+ ASSERT_TRUE(mask & (1ull << (i * SuperVector<16>::mask_width())));
+ }
mask = SP.eqmask(SP1);
ASSERT_EQ(mask,0);
vec2[0] = vec[0];
vec2[1] = vec[1];
auto SP2 = SuperVector<16>::loadu(vec2);
mask = SP.eqmask(SP2);
- ASSERT_EQ(mask,3);
+ ASSERT_TRUE(mask & 1);
+ ASSERT_TRUE(mask & (1ull << SuperVector<16>::mask_width()));
+ for (u32 i = 2; i < 16; ++i) {
+ ASSERT_FALSE(mask & (1ull << (i * SuperVector<16>::mask_width())));
+ }
}
/*Define LSHIFT128 macro*/
}
}
auto SP = SuperVector<32>::loadu(vec);
- u32 mask = SP.movemask();
+ u64a mask = SP.comparemask();
for(int i=0; i<32; i++) {
- if (mask & (1 << i)) {
+ if (mask & (1ull << (i * SuperVector<32>::mask_width()))) {
vec2[i] = 0xff;
}
}
for (int i = 0; i<32; i++) { vec2[i]= rand() % 100 + 67;}
auto SP = SuperVector<32>::loadu(vec);
auto SP1 = SuperVector<32>::loadu(vec2);
- u32 mask = SP.eqmask(SP);
- ASSERT_EQ(mask,0xFFFFFFFF);
+ u64a mask = SP.eqmask(SP);
+ for (u32 i = 0; i < 32; ++i) {
+ ASSERT_TRUE(mask & (1ull << (i * SuperVector<32>::mask_width())));
+ }
mask = SP.eqmask(SP1);
ASSERT_EQ(mask,0);
vec2[0] = vec[0];
vec2[1] = vec[1];
auto SP2 = SuperVector<32>::loadu(vec2);
mask = SP.eqmask(SP2);
- ASSERT_EQ(mask,3);
+ ASSERT_TRUE(mask & 1);
+ ASSERT_TRUE(mask & (1ull << SuperVector<32>::mask_width()));
+ for (u32 i = 2; i < 32; ++i) {
+ ASSERT_FALSE(mask & (1ull << (i * SuperVector<32>::mask_width())));
+ }
}
TEST(SuperVectorUtilsTest,pshufb256c) {
auto SP = SuperVector<64>::loadu(vec);
auto SP1 = SuperVector<64>::loadu(vec2);
u64a mask = SP.eqmask(SP);
+ // Mask width for 64 bit type cannot be more than 1.
+ ASSERT_EQ(SuperVector<64>::mask_width(), 1);
ASSERT_EQ(mask,0xFFFFFFFFFFFFFFFF);
mask = SP.eqmask(SP1);
ASSERT_EQ(mask,0);