// vermicelliBlock: forward test of one S-byte vector.
// Compares `chars` against (`casemask` & `data`) and hands the comparison
// mask to first_non_zero_match<S>, whose result is returned unchanged.
//   data     - vector under test (callers pass a SuperVector loaded from the buffer)
//   chars    - vector the masked input is compared with
//   casemask - ANDed into `data` before the comparison
//   buf      - base pointer forwarded to first_non_zero_match<S>
//   len      - new in this change; forwarded to first_non_zero_match<S>
// NOTE(review): the 64-byte first_non_zero_match builds (~0ULL) >> (64 - len),
// so len must be in [1, 64] there (len == 0 would be a shift by 64, which is
// undefined). The 16- and 32-byte specializations mark len as UNUSED.
template <uint16_t S>
static really_inline
-const u8 *vermicelliBlock(SuperVector<S> data, SuperVector<S> chars, SuperVector<S> casemask, const u8 *buf) {
+const u8 *vermicelliBlock(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
// Comparison result of `chars` against the case-masked input.
SuperVector<S> mask = chars.eq(casemask & data);
- return first_non_zero_match<S>(buf, mask);
+ return first_non_zero_match<S>(buf, mask, len);
}
// vermicelliBlockNeg: forward negated test of one S-byte vector.
// Builds the same comparison mask as vermicelliBlock (chars.eq(casemask & data))
// but passes it to first_zero_match_inverted<S>, whose result is returned.
//   data / chars / casemask - as in vermicelliBlock
//   buf - base pointer forwarded to first_zero_match_inverted<S>
//   len - new in this change; forwarded to first_zero_match_inverted<S>
// For S == 64 the helper inverts the movemask, keeps only the low `len` bits
// and returns buf + index of the lowest remaining set bit. Its mask is
// (~0ULL) >> (64 - len), so len must be in [1, 64] (len == 0 would be an
// undefined shift). The 16- and 32-byte specializations mark len as UNUSED.
template <uint16_t S>
static really_inline
-const u8 *vermicelliBlockNeg(SuperVector<S> data, SuperVector<S> chars, SuperVector<S> casemask, const u8 *buf) {
+const u8 *vermicelliBlockNeg(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
// Comparison result of `chars` against the case-masked input.
SuperVector<S> mask = chars.eq(casemask & data);
- return first_zero_match_inverted<S>(buf, mask);
+ return first_zero_match_inverted<S>(buf, mask, len);
}
// rvermicelliBlock: reverse test of one S-byte vector.
// Builds the comparison mask chars.eq(casemask & data) and returns the result
// of last_non_zero_match<S> on it.
//   data / chars / casemask - as in vermicelliBlock
//   buf - base pointer forwarded to last_non_zero_match<S>
//   len - new in this change; forwarded to last_non_zero_match<S>
// For S == 64 the helper keeps only the low `len` movemask bits and returns
// buf + (63 - clz64(z)) for the highest remaining set bit, or NULL when none
// is set. Its mask is (~0ULL) >> (64 - len), so len must be in [1, 64]
// (len == 0 would be an undefined shift). The 16- and 32-byte specializations
// mark len as UNUSED.
template <uint16_t S>
static really_inline
-const u8 *rvermicelliBlock(SuperVector<S> data, SuperVector<S> chars, SuperVector<S> casemask, const u8 *buf) {
+const u8 *rvermicelliBlock(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
// Comparison result of `chars` against the case-masked input.
SuperVector<S> mask = chars.eq(casemask & data);
- return last_non_zero_match<S>(buf, mask);
+ return last_non_zero_match<S>(buf, mask, len);
}
// rvermicelliBlockNeg: reverse negated test of one S-byte vector.
// Builds the comparison mask chars.eq(casemask & data) and returns the result
// of last_zero_match_inverted<S> on it.
//   data     - vector under test (callers pass a SuperVector loaded from the buffer)
//   chars    - vector the masked input is compared with
//   casemask - ANDed into `data` before the comparison
//   buf      - base pointer forwarded to last_zero_match_inverted<S>
//   len      - new in this change; forwarded to last_zero_match_inverted<S>
// For S == 64 the helper inverts the movemask, keeps only the low `len` bits
// and returns buf + (63 - clz64(z)) for the highest remaining set bit. Its
// mask is (~0ULL) >> (64 - len), so len must be in [1, 64] (len == 0 would be
// an undefined shift). The 16- and 32-byte specializations mark len as UNUSED.
// Review change: the print8() debug dumps this patch added here (and to none of
// the three sibling helpers) are dropped, and the pointer qualifier is spelled
// `u8 const *` like the siblings; the parameter types are unchanged.
template <uint16_t S>
static really_inline
-const u8 *rvermicelliBlockNeg(SuperVector<S> data, SuperVector<S> chars, SuperVector<S> casemask, const u8 *buf) {
+const u8 *rvermicelliBlockNeg(SuperVector<S> const data, SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u16 const len) {
// Comparison result of `chars` against the case-masked input.
SuperVector<S> mask = chars.eq(casemask & data);
- return last_zero_match_inverted<S>(buf, mask);
+ return last_zero_match_inverted<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
-const u8 *vermicelliDoubleBlock(SuperVector<S> data, SuperVector<S> chars1, SuperVector<S> chars2, SuperVector<S> casemask,
- u8 const c1, u8 const c2, u8 const casechar, const u8 *buf) {
+const u8 *vermicelliDoubleBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2, SuperVector<S> const casemask,
+ u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) {
SuperVector<S> v = casemask & data;
SuperVector<S> mask1 = chars1.eq(v);
DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) return buf - 1;
- return first_non_zero_match<S>(buf, mask);
+ return first_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
-const u8 *rvermicelliDoubleBlock(SuperVector<S> data, SuperVector<S> chars1, SuperVector<S> chars2, SuperVector<S> casemask,
- u8 const c1, u8 const c2, u8 const casechar, const u8 *buf) {
+const u8 *rvermicelliDoubleBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2, SuperVector<S> const casemask,
+ u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) {
SuperVector<S> v = casemask & data;
SuperVector<S> mask1 = chars1.eq(v);
mask = mask | (SuperVector<S>::Ones() >> (S-1));
}
- return last_non_zero_match<S>(buf, mask);
+ return last_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
static really_inline
-const u8 *vermicelliDoubleMaskedBlock(SuperVector<S> data, SuperVector<S> chars1, SuperVector<S> chars2,
- SuperVector<S> mask1, SuperVector<S> mask2,
- u8 const c1, u8 const c2, u8 const m1, u8 const m2, const u8 *buf) {
+const u8 *vermicelliDoubleMaskedBlock(SuperVector<S> const data, SuperVector<S> const chars1, SuperVector<S> const chars2,
+ SuperVector<S> const mask1, SuperVector<S> const mask2,
+ u8 const c1, u8 const c2, u8 const m1, u8 const m2, u8 const *buf, u16 const len) {
SuperVector<S> v1 = chars1.eq(data & mask1);
SuperVector<S> v2 = chars2.eq(data & mask2);
DEBUG_PRINTF("partial = %d\n", partial_match);
if (partial_match) return buf - 1;
- return first_non_zero_match<S>(buf, mask);
+ return first_non_zero_match<S>(buf, mask, len);
}
template <uint16_t S>
-static const u8 *vermicelliExecReal(SuperVector<S> const chars, SuperVector<S> const casemask, const u8 *buf, const u8 *buf_end) {
+static const u8 *vermicelliExecReal(SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u8 const *buf_end) {
assert(buf && buf_end);
assert(buf < buf_end);
DEBUG_PRINTF("verm %p len %zu\n", buf, buf_end - buf);
// Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
if (!ISALIGNED_N(d, S)) {
+ u8 const *d1 = ROUNDUP_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d);
- rv = vermicelliBlock(data, chars, casemask, d);
+ rv = vermicelliBlock(data, chars, casemask, d, S);
if (rv) return rv;
- d = ROUNDUP_PTR(d, S);
+ d = d1;
}
while(d + S <= buf_end) {
__builtin_prefetch(d + 64);
DEBUG_PRINTF("d %p \n", d);
SuperVector<S> data = SuperVector<S>::load(d);
- rv = vermicelliBlock(data, chars, casemask, d);
+ rv = vermicelliBlock(data, chars, casemask, d, S);
if (rv) return rv;
d += S;
}
if (d != buf_end) {
SuperVector<S> data = SuperVector<S>::loadu_maskz(d, buf_end - d);
- rv = vermicelliBlock(data, chars, casemask, d);
+ rv = vermicelliBlock(data, chars, casemask, d, buf_end - d);
DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv;
}
// Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
if (!ISALIGNED_N(d, S)) {
+ u8 const *d1 = ROUNDUP_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d);
- rv = vermicelliBlockNeg(data, chars, casemask, d);
+ rv = vermicelliBlockNeg(data, chars, casemask, d, S);
if (rv) return rv;
- d = ROUNDUP_PTR(d, S);
+ d = d1;
}
while(d + S <= buf_end) {
__builtin_prefetch(d + 64);
DEBUG_PRINTF("d %p \n", d);
SuperVector<S> data = SuperVector<S>::load(d);
- rv = vermicelliBlockNeg(data, chars, casemask, d);
+ rv = vermicelliBlockNeg(data, chars, casemask, d, S);
if (rv) return rv;
d += S;
}
if (d != buf_end) {
SuperVector<S> data = SuperVector<S>::loadu_maskz(d, buf_end - d);
- rv = vermicelliBlockNeg(data, chars, casemask, d);
+ rv = vermicelliBlockNeg(data, chars, casemask, d, buf_end - d);
DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv;
}
// Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S));
if (!ISALIGNED_N(d, S)) {
+ u8 const *d1 = ROUNDDOWN_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d - S);
- rv = rvermicelliBlock(data, chars, casemask, d - S);
+ rv = rvermicelliBlock(data, chars, casemask, d - S, S);
DEBUG_PRINTF("rv %p \n", rv);
if (rv) return rv;
- d = ROUNDDOWN_PTR(d, S);
+ d = d1;
}
while (d - S >= buf) {
d -= S;
SuperVector<S> data = SuperVector<S>::load(d);
- rv = rvermicelliBlock(data, chars, casemask, d);
+ rv = rvermicelliBlock(data, chars, casemask, d, S);
if (rv) return rv;
}
}
if (d != buf) {
SuperVector<S> data = SuperVector<S>::loadu(buf);
- rv = rvermicelliBlock(data, chars, casemask, buf);
+ rv = rvermicelliBlock(data, chars, casemask, buf, d - buf);
DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv;
}
// Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S));
if (!ISALIGNED_N(d, S)) {
+ u8 const *d1 = ROUNDDOWN_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d - S);
- rv = rvermicelliBlockNeg(data, chars, casemask, d - S);
+ rv = rvermicelliBlockNeg(data, chars, casemask, d - S, S);
DEBUG_PRINTF("rv %p \n", rv);
if (rv) return rv;
- d = ROUNDDOWN_PTR(d, S);
+ d = d1;
}
while (d - S >= buf) {
d -= S;
SuperVector<S> data = SuperVector<S>::load(d);
- rv = rvermicelliBlockNeg(data, chars, casemask, d);
+ rv = rvermicelliBlockNeg(data, chars, casemask, d, S);
if (rv) return rv;
}
}
if (d != buf) {
SuperVector<S> data = SuperVector<S>::loadu(buf);
- rv = rvermicelliBlockNeg(data, chars, casemask, buf);
+ rv = rvermicelliBlockNeg(data, chars, casemask, buf, d - buf);
DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv;
}
// Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
if (!ISALIGNED_N(d, S)) {
+ u8 const *d1 = ROUNDUP_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d);
- rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d);
+ rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d, S);
if (rv) return rv;
- d = ROUNDUP_PTR(d, S);
+ d = d1;
}
while(d + S <= buf_end) {
__builtin_prefetch(d + 64);
DEBUG_PRINTF("d %p \n", d);
SuperVector<S> data = SuperVector<S>::load(d);
- rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d);
+ rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d, S);
if (rv) return rv;
d += S;
}
if (d != buf_end) {
SuperVector<S> data = SuperVector<S>::loadu_maskz(d, buf_end - d);
- rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d);
+ rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d, buf_end - d);
DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv;
}
// Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S));
if (!ISALIGNED_N(d, S)) {
+ u8 const *d1 = ROUNDDOWN_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d - S);
- rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d - S);
+ rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d - S, S);
DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv;
- d = ROUNDDOWN_PTR(d, S);
+ d = d1;
}
while (d - S >= buf) {
d -= S;
SuperVector<S> data = SuperVector<S>::load(d);
- rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d);
+ rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d, S);
if (rv) return rv;
}
}
if (d != buf) {
SuperVector<S> data = SuperVector<S>::loadu(buf);
- rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, buf);
+ rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, buf, d - buf);
DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv;
}
// Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
if (!ISALIGNED_N(d, S)) {
+ u8 const *d1 = ROUNDUP_PTR(d, S);
SuperVector<S> data = SuperVector<S>::loadu(d);
- rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d);
+ rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d, S);
if (rv) return rv;
- d = ROUNDUP_PTR(d, S);
+ d = d1;
}
while(d + S <= buf_end) {
__builtin_prefetch(d + 64);
DEBUG_PRINTF("d %p \n", d);
SuperVector<S> data = SuperVector<S>::load(d);
- rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d);
+ rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d, S);
if (rv) return rv;
d += S;
}
if (d != buf_end) {
SuperVector<S> data = SuperVector<S>::loadu_maskz(d, buf_end - d);
- rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d);
+ rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d, buf_end - d);
DEBUG_PRINTF("rv %p \n", rv);
if (rv && rv < buf_end) return rv;
}
assert(buf < buf_end);
return vermicelliDoubleMaskedExecReal<VECTORSIZE>(c1, c2, m1, m2, buf, buf_end);
-}
\ No newline at end of file
+}
template <>
really_really_inline
-const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v) {
+const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
template <>
really_really_inline
-const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v) {
+const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
SuperVector<32>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%08x\n", z);
if (unlikely(z)) {
}
template <>
really_really_inline
-const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v) {
+const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
SuperVector<64>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%016llx\n", z);
+ u64a mask = (~0ULL) >> (64 - len);
+ DEBUG_PRINTF("mask %016llx\n", mask);
+ z &= mask;
+ DEBUG_PRINTF("z 0x%016llx\n", z);
if (unlikely(z)) {
u32 pos = ctz64(z);
DEBUG_PRINTF("match @ pos %u\n", pos);
template <>
really_really_inline
-const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v) {
+const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
template <>
really_really_inline
-const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v) {
+const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
SuperVector<32>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%08x\n", z);
if (unlikely(z)) {
}
template <>
really_really_inline
-const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v) {
+const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
SuperVector<64>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%016llx\n", z);
+ u64a mask = (~0ULL) >> (64 - len);
+ DEBUG_PRINTF("mask %016llx\n", mask);
+ z &= mask;
+ DEBUG_PRINTF("z 0x%016llx\n", z);
if (unlikely(z)) {
u32 pos = clz64(z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 64);
- return buf + (31 - pos);
+ return buf + (63 - pos);
} else {
return NULL; // no match
}
template <>
really_really_inline
-const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v) {
+const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
template <>
really_really_inline
-const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v) {
+const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
SuperVector<32>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%08x\n", z);
if (unlikely(z != 0xffffffff)) {
}
template <>
really_really_inline
-const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v) {
+const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
SuperVector<64>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%016llx\n", z);
- if (unlikely(z != ~0ULL)) {
- u32 pos = ctz64(~z);
+ u64a mask = (~0ULL) >> (64 - len);
+ DEBUG_PRINTF("mask %016llx\n", mask);
+ z = ~z & mask;
+ DEBUG_PRINTF("z 0x%016llx\n", z);
+ if (unlikely(z)) {
+ u32 pos = ctz64(z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 64);
return buf + pos;
template <>
really_really_inline
-const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v) {
+const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) {
SuperVector<16>::movemask_type z = v.movemask();
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
DEBUG_PRINTF("z %08x\n", z);
template<>
really_really_inline
-const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v) {
+const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, uint16_t UNUSED len) {
SuperVector<32>::movemask_type z = v.movemask();
if (unlikely(z != 0xffffffff)) {
- u32 pos = clz32(~z);
+ u32 pos = clz32(~z & 0xffffffff);
DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
assert(pos < 32);
return buf + (31 - pos);
template <>
really_really_inline
-const u8 *last_zero_match_inverted<64>(const u8 *buf, SuperVector<64> v) {
+const u8 *last_zero_match_inverted<64>(const u8 *buf, SuperVector<64> v, uint16_t len) {
+ v.print8("v");
SuperVector<64>::movemask_type z = v.movemask();
DEBUG_PRINTF("z 0x%016llx\n", z);
- if (unlikely(z != ~0ULL)) {
- u32 pos = clz64(~z);
+ u64a mask = (~0ULL) >> (64 - len);
+ DEBUG_PRINTF("mask %016llx\n", mask);
+ z = ~z & mask;
+ DEBUG_PRINTF("z 0x%016llx\n", z);
+ if (unlikely(z)) {
+ u32 pos = clz64(z);
+ DEBUG_PRINTF("~z 0x%016llx\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 64);
return buf + (63 - pos);