const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
/* +1: the zones ensure that we can read the byte at z->end */
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
+
u64a reach0 = andn(domain_mask_flipped, itPtr);
u64a reach1 = andn(domain_mask_flipped, itPtr + 1);
u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
st0 = or128(st0, st4);
*s = or128(*s, st0);
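+ /* Fold the trailing XOR into the assignment so conf0 is written once
+  * instead of store-then-flip. */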
- *conf0 = movq(*s);
+ *conf0 = movq(*s) ^ ~0ULL;
*s = rshiftbyte_m128(*s, 8);
- *conf0 ^= ~0ULL;
u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
u64a reach9 = andn(domain_mask_flipped, itPtr + 9);
u64a reach10 = andn(domain_mask_flipped, itPtr + 10);
u64a reach11 = andn(domain_mask_flipped, itPtr + 11);
m128 st8 = load_m128_from_u64a(ft + reach8);
m128 st9 = load_m128_from_u64a(ft + reach9);
m128 st10 = load_m128_from_u64a(ft + reach10);
m128 st11 = load_m128_from_u64a(ft + reach11);
st8 = or128(st8, st12);
*s = or128(*s, st8);
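+ /* Same fold for conf8: write once instead of store-then-XOR. */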
- *conf8 = movq(*s);
+ *conf8 = movq(*s) ^ ~0ULL;
*s = rshiftbyte_m128(*s, 8);
- *conf8 ^= ~0ULL;
}
static really_inline
UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
+
u64a reach0 = andn(domain_mask_flipped, itPtr);
u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
+
u64a reach0 = andn(domain_mask_flipped, itPtr);
u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
const u8 *tryFloodDetect = zz->floodPtr; \
const u8 *start_ptr = zz->start; \
const u8 *end_ptr = zz->end; \
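+ /* Warm the cache before the scan: one prefetch per 4*ITER_BYTES, \
+  * i.e. one 64-byte cache line per step when ITER_BYTES is 16. */ \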
+ for (const u8 *itPtr = start_ptr; itPtr + 4*ITER_BYTES <= end_ptr; \
+ itPtr += 4*ITER_BYTES) { \
+ __builtin_prefetch(itPtr); \
+ } \
\
for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \
itPtr += ITER_BYTES) { \
assert(ISALIGNED_N(q->state, 2));
u32 s = *(u16 *)q->state;
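+ /* Pull the 256-byte remap table into cache up front: four prefetches
+  * at 64-byte steps cover the whole table on 64-byte cache lines. */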
+ __builtin_prefetch(&m->remap[0]);
+ __builtin_prefetch(&m->remap[64]);
+ __builtin_prefetch(&m->remap[128]);
+ __builtin_prefetch(&m->remap[192]);
+
if (q->report_current) {
assert(s);
assert(get_aux(m, s)->accept);
u32 s = *(u8 *)q->state;
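+ /* As in the 16-bit path: pre-load the 256-byte remap table. */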
+ __builtin_prefetch(&m->remap[0]);
+ __builtin_prefetch(&m->remap[64]);
+ __builtin_prefetch(&m->remap[128]);
+ __builtin_prefetch(&m->remap[192]);
+
if (q->report_current) {
assert(s);
assert(s >= m->accept_limit_8);
return MO_ALIVE;
}
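+ /* Pre-load the remap table before the query loop. */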
+ __builtin_prefetch(&m->remap[0]);
+ __builtin_prefetch(&m->remap[64]);
+ __builtin_prefetch(&m->remap[128]);
+ __builtin_prefetch(&m->remap[192]);
+
while (1) {
assert(q->cur < q->end);
s64a ep = q->items[q->cur].location;
return MO_ALIVE;
}
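+ /* Pre-load the remap table, as above. */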
+ __builtin_prefetch(&m->remap[0]);
+ __builtin_prefetch(&m->remap[64]);
+ __builtin_prefetch(&m->remap[128]);
+ __builtin_prefetch(&m->remap[192]);
+
while (1) {
DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" :
q->items[q->cur].type == MQE_END ? "END" : "???",
#endif
#define GET_LO_4(chars) and128(chars, low4bits)
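+// GET_HI_4 rewritten: shift first, then mask with low4bits. Equivalent to
+// the andnot form it replaces; the AND clears bits crossing byte lanes.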
-#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4)
+#define GET_HI_4(chars) and128(rshift64_m128(chars, 4), low4bits)
static really_inline
u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits,
// Reroll FTW.
const u8 *last_block = buf_end - 16;
+
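+ // Prefetch the buffer at a 64-byte (4*16) stride before the scan loop.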
+ for (const u8 *itPtr = buf; itPtr + 4*16 <= last_block; itPtr += 4*16) {
+ __builtin_prefetch(itPtr);
+ }
while (buf < last_block) {
m128 lchars = load128(buf);
rv = fwdBlock(mask_lo, mask_hi, lchars, buf, low4bits, zeroes);