/*
- * Copyright (c) 2016-2018, Intel Corporation
+ * Copyright (c) 2016-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
static
char mcshengHasAccept(const struct mcsheng *m, const struct mstate_aux *aux,
- ReportID report) {
+ ReportID report) {
assert(m && aux);
if (!aux->accept) {
return 0;
}
+
+#if defined(HAVE_AVX512VBMI)
+static really_inline
+const struct mstate_aux *get_aux64(const struct mcsheng64 *m, u32 s) {
+ const char *nfa = (const char *)m - sizeof(struct NFA);
+ const struct mstate_aux *aux
+ = s + (const struct mstate_aux *)(nfa + m->aux_offset);
+
+ assert(ISALIGNED(aux));
+ return aux;
+}
+
+static really_inline
+u32 mcshengEnableStarts64(const struct mcsheng64 *m, u32 s) {
+ const struct mstate_aux *aux = get_aux64(m, s);
+
+ DEBUG_PRINTF("enabling starts %u->%hu\n", s, aux->top);
+ return aux->top;
+}
+
+static really_inline
+char doComplexReport64(NfaCallback cb, void *ctxt, const struct mcsheng64 *m,
+ u32 s, u64a loc, char eod, u32 *cached_accept_state,
+ u32 *cached_accept_id) {
+ DEBUG_PRINTF("reporting state = %u, loc=%llu, eod %hhu\n",
+ s & STATE_MASK, loc, eod);
+
+ if (!eod && s == *cached_accept_state) {
+ if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) {
+ return MO_HALT_MATCHING; /* termination requested */
+ }
+
+ return MO_CONTINUE_MATCHING; /* continue execution */
+ }
+
+ const struct mstate_aux *aux = get_aux64(m, s);
+ size_t offset = eod ? aux->accept_eod : aux->accept;
+
+ assert(offset);
+ const struct report_list *rl
+ = (const void *)((const char *)m + offset - sizeof(struct NFA));
+ assert(ISALIGNED(rl));
+
+ DEBUG_PRINTF("report list size %u\n", rl->count);
+ u32 count = rl->count;
+
+ if (!eod && count == 1) {
+ *cached_accept_state = s;
+ *cached_accept_id = rl->report[0];
+
+ DEBUG_PRINTF("reporting %u\n", rl->report[0]);
+ if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) {
+ return MO_HALT_MATCHING; /* termination requested */
+ }
+
+ return MO_CONTINUE_MATCHING; /* continue execution */
+ }
+
+ for (u32 i = 0; i < count; i++) {
+ DEBUG_PRINTF("reporting %u\n", rl->report[i]);
+ if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) {
+ return MO_HALT_MATCHING; /* termination requested */
+ }
+ }
+
+ return MO_CONTINUE_MATCHING; /* continue execution */
+}
+
+static really_inline
+u32 doSheng64(const struct mcsheng64 *m, const u8 **c_inout, const u8 *soft_c_end,
+ const u8 *hard_c_end, u32 s_in, char do_accel) {
+ assert(s_in < m->sheng_end);
+ assert(s_in); /* should not already be dead */
+ assert(soft_c_end <= hard_c_end);
+ DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1);
+ m512 s = set64x8(s_in - 1);
+ const u8 *c = *c_inout;
+ const u8 *c_end = hard_c_end - SHENG_CHUNK + 1;
+ if (!do_accel) {
+ c_end = MIN(soft_c_end, hard_c_end - SHENG_CHUNK + 1);
+ }
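+
+ /* The main loop only runs while a full SHENG_CHUNK (8 bytes, matching the
+ * 8 unrolled steps) remains before hard_c_end, so the unaligned 8-byte
+ * load in the BMI2 path cannot read past the buffer; the 0..7 byte tail
+ * is handled by the switch after the loop. */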
+
+ const m512 *masks = m->sheng_succ_masks;
+ u8 sheng_limit = m->sheng_end - 1; /* - 1: no dead state */
+ u8 sheng_stop_limit = do_accel ? m->sheng_accel_limit : sheng_limit;
+
+ /* When we use movd to get a u32 containing our state, it will have 4 lanes
+ * all duplicating the state. We can create versions of our limits with 4
+ * copies to compare against directly; this saves generating code to
+ * extract a single copy of the state from the u32 for checking. */
+ u32 sheng_stop_limit_x4 = sheng_stop_limit * 0x01010101;
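+ /* e.g. state 3 vs stop limit 5: movd512 of the replicated state gives
+ * 0x03030303 while sheng_stop_limit_x4 is 0x05050505, so one u32 compare
+ * answers "state >= limit" without extracting a single byte first. */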
+
+#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
+ u32 sheng_limit_x4 = sheng_limit * 0x01010101;
+ m512 simd_stop_limit = set16x32(sheng_stop_limit_x4);
+ m512 accel_delta = set64x8(sheng_limit - sheng_stop_limit);
+ DEBUG_PRINTF("end %hhu, accel %hu --> limit %hhu\n", sheng_limit,
+ m->sheng_accel_limit, sheng_stop_limit);
+#endif
+
+#define SHENG64_SINGLE_ITER do { \
+ m512 succ_mask = masks[*(c++)]; \
+ s = vpermb512(s, succ_mask); \
+ u32 s_gpr_x4 = movd512(s); /* u32 with the u8 state in all 4 lanes */ \
+ DEBUG_PRINTF("c %hhu (%c) --> s %u\n", c[-1], c[-1], s_gpr_x4); \
+ if (s_gpr_x4 >= sheng_stop_limit_x4) { \
+ s_gpr = s_gpr_x4; \
+ goto exit; \
+ } \
+ } while (0)
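+
+ /* Each iteration is one DFA step: masks[*c] is the 64-entry successor
+ * table for that input byte, and vpermb512 looks up entry s in every
+ * lane at once; since all lanes hold the same state, the new state comes
+ * out replicated across the register. */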
+
+ u8 s_gpr;
+ while (c < c_end) {
+#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
+ /* This version uses pext for efficiently bitbashing out scaled
+ * versions of the bytes to process from a u64a */
+
+ u64a data_bytes = unaligned_load_u64a(c);
+ u64a cc0 = pdep64(data_bytes, 0x3fc0); /* extract scaled low byte */
+ data_bytes &= ~0xffULL; /* clear low bits for scale space */
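+ /* 0x3fc0 covers bits 6..13, so cc0 == (u64a)c[0] << 6
+ * == c[0] * sizeof(m512): a ready-made byte offset into the succ masks.
+ * The pext masks below pull out c[1]..c[7] pre-scaled the same way (see
+ * the assert in the unroll macro). */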
+
+ m512 succ_mask0 = load512((const char *)masks + cc0);
+ s = vpermb512(s, succ_mask0);
+ m512 s_max = s;
+ m512 s_max0 = s_max;
+ DEBUG_PRINTF("c %02llx --> s %u\n", cc0 >> 6, movd512(s));
+
+#define SHENG64_SINGLE_UNROLL_ITER(iter) \
+ assert(iter); \
+ u64a cc##iter = pext64(data_bytes, mcsheng64_pext_mask[iter]); \
+ assert(cc##iter == (u64a)c[iter] << 6); \
+ m512 succ_mask##iter = load512((const char *)masks + cc##iter); \
+ s = vpermb512(s, succ_mask##iter); \
+ if (do_accel && iter == 7) { \
+ /* in the final iteration we also have to check against accel */ \
+ m512 s_temp = sadd_u8_m512(s, accel_delta); \
+ s_max = max_u8_m512(s_max, s_temp); \
+ } else { \
+ s_max = max_u8_m512(s_max, s); \
+ } \
+ m512 s_max##iter = s_max; \
+ DEBUG_PRINTF("c %02llx --> s %u max %u\n", cc##iter >> 6, \
+ movd512(s), movd512(s_max));
+
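+ /* On the final step, when accelerating, accel_delta (sheng_limit -
+ * sheng_stop_limit) is saturating-added before taking the max so that
+ * accel-eligible states are pushed to >= sheng_limit; the single
+ * s_max7 >= sheng_limit_x4 test below then catches accept/dead and
+ * accel exits alike. */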
+ SHENG64_SINGLE_UNROLL_ITER(1);
+ SHENG64_SINGLE_UNROLL_ITER(2);
+ SHENG64_SINGLE_UNROLL_ITER(3);
+ SHENG64_SINGLE_UNROLL_ITER(4);
+ SHENG64_SINGLE_UNROLL_ITER(5);
+ SHENG64_SINGLE_UNROLL_ITER(6);
+ SHENG64_SINGLE_UNROLL_ITER(7);
+
+ if (movd512(s_max7) >= sheng_limit_x4) {
+ DEBUG_PRINTF("exit found\n");
+
+ /* Explicitly check the last byte as it is more likely as it also
+ * checks for acceleration. */
+ if (movd512(s_max6) < sheng_limit_x4) {
+ c += SHENG_CHUNK;
+ s_gpr = movq512(s);
+ assert(s_gpr >= sheng_stop_limit);
+ goto exit;
+ }
+
+ /* use shift-xor to create a register containing all of the max
+ * values */
+ m512 blended = rshift64_m512(s_max0, 56);
+ blended = xor512(blended, rshift64_m512(s_max1, 48));
+ blended = xor512(blended, rshift64_m512(s_max2, 40));
+ blended = xor512(blended, rshift64_m512(s_max3, 32));
+ blended = xor512(blended, rshift64_m512(s_max4, 24));
+ blended = xor512(blended, rshift64_m512(s_max5, 16));
+ blended = xor512(blended, rshift64_m512(s_max6, 8));
+ blended = xor512(blended, s);
+ blended = xor512(blended, rshift64_m512(blended, 8));
+ DEBUG_PRINTF("blended %016llx\n", movq512(blended));
+
+ m512 final = min_u8_m512(blended, simd_stop_limit);
+ m512 cmp = sub_u8_m512(final, simd_stop_limit);
+ m128 tmp = cast512to128(cmp);
+ u64a stops = ~movemask128(tmp);
+ assert(stops);
+ u32 earliest = ctz32(stops);
+ DEBUG_PRINTF("stops %02llx, earliest %u\n", stops, earliest);
+ assert(earliest < 8);
+ c += earliest + 1;
+ s_gpr = movq512(blended) >> (earliest * 8);
+ assert(s_gpr >= sheng_stop_limit);
+ goto exit;
+ } else {
+ c += SHENG_CHUNK;
+ }
+#else
+ SHENG64_SINGLE_ITER;
+ SHENG64_SINGLE_ITER;
+ SHENG64_SINGLE_ITER;
+ SHENG64_SINGLE_ITER;
+
+ SHENG64_SINGLE_ITER;
+ SHENG64_SINGLE_ITER;
+ SHENG64_SINGLE_ITER;
+ SHENG64_SINGLE_ITER;
+#endif
+ }
+
+ assert(c_end - c < SHENG_CHUNK);
+ if (c < soft_c_end) {
+ assert(soft_c_end - c < SHENG_CHUNK);
+ switch (soft_c_end - c) {
+ case 7:
+ SHENG64_SINGLE_ITER; // fallthrough
+ case 6:
+ SHENG64_SINGLE_ITER; // fallthrough
+ case 5:
+ SHENG64_SINGLE_ITER; // fallthrough
+ case 4:
+ SHENG64_SINGLE_ITER; // fallthrough
+ case 3:
+ SHENG64_SINGLE_ITER; // fallthrough
+ case 2:
+ SHENG64_SINGLE_ITER; // fallthrough
+ case 1:
+ SHENG64_SINGLE_ITER; // fallthrough
+ }
+ }
+
+ assert(c >= soft_c_end);
+
+ s_gpr = movq512(s);
+exit:
+ assert(c <= hard_c_end);
+ DEBUG_PRINTF("%zu from end; s %hhu\n", c_end - c, s_gpr);
+ assert(c >= soft_c_end || s_gpr >= sheng_stop_limit);
+ /* undo state adjustment to match mcclellan view */
+ if (s_gpr == sheng_limit) {
+ s_gpr = 0;
+ } else if (s_gpr < sheng_limit) {
+ s_gpr++;
+ }
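+ /* s_gpr is now in mcclellan numbering: 0 for dead (internal sheng_limit),
+ * 1..sheng_limit for the sheng region (internally 0-based, hence the
+ * s_in - 1 on entry), and mcclellan escapes pass through unchanged. */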
+
+ *c_inout = c;
+ return s_gpr;
+}
+
+static really_inline
+const char *findShermanState64(UNUSED const struct mcsheng64 *m,
+ const char *sherman_base_offset,
+ u32 sherman_base, u32 s) {
+ const char *rv
+ = sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base);
+ assert(rv < (const char *)m + m->length - sizeof(struct NFA));
+ UNUSED u8 type = *(const u8 *)(rv + SHERMAN_TYPE_OFFSET);
+ assert(type == SHERMAN_STATE);
+ return rv;
+}
+
+static really_inline
+const u8 *run_mcsheng_accel64(const struct mcsheng64 *m,
+ const struct mstate_aux *aux, u32 s,
+ const u8 **min_accel_offset,
+ const u8 *c, const u8 *c_end) {
+ DEBUG_PRINTF("skipping\n");
+ u32 accel_offset = aux[s].accel_offset;
+
+ assert(aux[s].accel_offset);
+ assert(accel_offset >= m->aux_offset);
+ assert(!m->sherman_offset || accel_offset < m->sherman_offset);
+
+ const union AccelAux *aaux = (const void *)((const char *)m + accel_offset);
+ const u8 *c2 = run_accel(aaux, c, c_end);
+
+ if (c2 < *min_accel_offset + BAD_ACCEL_DIST) {
+ *min_accel_offset = c2 + BIG_ACCEL_PENALTY;
+ } else {
+ *min_accel_offset = c2 + SMALL_ACCEL_PENALTY;
+ }
+
+ if (*min_accel_offset >= c_end - ACCEL_MIN_LEN) {
+ *min_accel_offset = c_end;
+ }
+
+ DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n",
+ c2 - c, *min_accel_offset - c2, c_end - c2);
+
+ return c2;
+}
+
+static really_inline
+u32 doNormal64_16(const struct mcsheng64 *m, const u8 **c_inout, const u8 *end,
+ u32 s, char do_accel, enum MatchMode mode) {
+ const u8 *c = *c_inout;
+ const u16 *succ_table
+ = (const u16 *)((const char *)m + sizeof(struct mcsheng64));
+ assert(ISALIGNED_N(succ_table, 2));
+ u32 sheng_end = m->sheng_end;
+ u32 sherman_base = m->sherman_limit;
+ const char *sherman_base_offset
+ = (const char *)m - sizeof(struct NFA) + m->sherman_offset;
+ u32 as = m->alphaShift;
+
+ /* Adjust the start of the succ table so it can be indexed by state id
+ * directly (rather than by a rebased normal id). As states with low ids
+ * are never processed here, no reads land before the real table. Note:
+ * due to the size of the sheng tables, the rebased succ_table pointer
+ * still points inside the engine. */
+ succ_table -= sheng_end << as;
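+ /* e.g. with sheng_end 64: state 70 indexes succ_table[(70 << as) + cprime],
+ * which after this rebase lands on stored row 70 - 64 = 6; rows exist
+ * only for normal (non-sheng, non-sherman) states. */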
+ s &= STATE_MASK;
+ while (c < end && s >= sheng_end) {
+ u8 cprime = m->remap[*c];
+ DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx (s=%u)\n", *c,
+ ourisprint(*c) ? *c : '?', cprime, s);
+ if (s < sherman_base) {
+ DEBUG_PRINTF("doing normal\n");
+ assert(s < m->state_count);
+ s = succ_table[(s << as) + cprime];
+ } else {
+ const char *sherman_state
+ = findShermanState64(m, sherman_base_offset, sherman_base, s);
+ DEBUG_PRINTF("doing sherman (%u)\n", s);
+ s = doSherman16(sherman_state, cprime, succ_table, as);
+ }
+
+ DEBUG_PRINTF("s: %u (%u)\n", s, s & STATE_MASK);
+ c++;
+
+ if (do_accel && (s & ACCEL_FLAG)) {
+ break;
+ }
+ if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) {
+ break;
+ }
+
+ s &= STATE_MASK;
+ }
+
+ *c_inout = c;
+ return s;
+}
+
+static really_inline
+char mcsheng64Exec16_i(const struct mcsheng64 *m, u32 *state, const u8 *buf,
+ size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+ char single, const u8 **c_final, enum MatchMode mode) {
+ assert(ISALIGNED_N(state, 2));
+ if (!len) {
+ if (mode == STOP_AT_MATCH) {
+ *c_final = buf;
+ }
+ return MO_ALIVE;
+ }
+
+ u32 s = *state;
+ const u8 *c = buf;
+ const u8 *c_end = buf + len;
+ const u8 sheng_end = m->sheng_end;
+ const struct mstate_aux *aux
+ = (const struct mstate_aux *)((const char *)m + m->aux_offset
+ - sizeof(struct NFA));
+
+ s &= STATE_MASK;
+
+ u32 cached_accept_id = 0;
+ u32 cached_accept_state = 0;
+
+ DEBUG_PRINTF("s: %u, len %zu\n", s, len);
+
+ const u8 *min_accel_offset = c;
+ if (!m->has_accel || len < ACCEL_MIN_LEN) {
+ min_accel_offset = c_end;
+ goto without_accel;
+ }
+
+ goto with_accel;
+
+without_accel:
+ do {
+ assert(c < min_accel_offset);
+ int do_accept;
+ if (!s) {
+ goto exit;
+ } else if (s < sheng_end) {
+ s = doSheng64(m, &c, min_accel_offset, c_end, s, 0);
+ do_accept = mode != NO_MATCHES && get_aux64(m, s)->accept;
+ } else {
+ s = doNormal64_16(m, &c, min_accel_offset, s, 0, mode);
+
+ do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG);
+ }
+
+ if (do_accept) {
+ if (mode == STOP_AT_MATCH) {
+ *state = s & STATE_MASK;
+ *c_final = c - 1;
+ return MO_MATCHES_PENDING;
+ }
+
+ u64a loc = (c - 1) - buf + offAdj + 1;
+
+ if (single) {
+ DEBUG_PRINTF("reporting %u\n", m->arb_report);
+ if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) {
+ return MO_DEAD; /* termination requested */
+ }
+ } else if (doComplexReport64(cb, ctxt, m, s & STATE_MASK, loc, 0,
+ &cached_accept_state,
+ &cached_accept_id)
+ == MO_HALT_MATCHING) {
+ return MO_DEAD;
+ }
+ }
+
+ assert(c <= c_end); /* sheng is fuzzy for min_accel_offset */
+ } while (c < min_accel_offset);
+
+ if (c == c_end) {
+ goto exit;
+ }
+
+with_accel:
+ do {
+ assert(c < c_end);
+ int do_accept;
+
+ if (!s) {
+ goto exit;
+ } else if (s < sheng_end) {
+ if (s > m->sheng_accel_limit) {
+ c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end);
+ if (c == c_end) {
+ goto exit;
+ } else {
+ goto without_accel;
+ }
+ }
+ s = doSheng64(m, &c, c_end, c_end, s, 1);
+ do_accept = mode != NO_MATCHES && get_aux64(m, s)->accept;
+ } else {
+ if (s & ACCEL_FLAG) {
+ DEBUG_PRINTF("skipping\n");
+ s &= STATE_MASK;
+ c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end);
+ if (c == c_end) {
+ goto exit;
+ } else {
+ goto without_accel;
+ }
+ }
+
+ s = doNormal64_16(m, &c, c_end, s, 1, mode);
+ do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG);
+ }
+
+ if (do_accept) {
+ if (mode == STOP_AT_MATCH) {
+ *state = s & STATE_MASK;
+ *c_final = c - 1;
+ return MO_MATCHES_PENDING;
+ }
+
+ u64a loc = (c - 1) - buf + offAdj + 1;
+
+ if (single) {
+ DEBUG_PRINTF("reporting %u\n", m->arb_report);
+ if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) {
+ return MO_DEAD; /* termination requested */
+ }
+ } else if (doComplexReport64(cb, ctxt, m, s & STATE_MASK, loc, 0,
+ &cached_accept_state,
+ &cached_accept_id)
+ == MO_HALT_MATCHING) {
+ return MO_DEAD;
+ }
+ }
+
+ assert(c <= c_end);
+ } while (c < c_end);
+
+exit:
+ s &= STATE_MASK;
+
+ if (mode == STOP_AT_MATCH) {
+ *c_final = c_end;
+ }
+ *state = s;
+
+ return MO_ALIVE;
+}
+
+static never_inline
+char mcsheng64Exec16_i_cb(const struct mcsheng64 *m, u32 *state, const u8 *buf,
+ size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+ char single, const u8 **final_point) {
+ return mcsheng64Exec16_i(m, state, buf, len, offAdj, cb, ctxt, single,
+ final_point, CALLBACK_OUTPUT);
+}
+
+static never_inline
+char mcsheng64Exec16_i_sam(const struct mcsheng64 *m, u32 *state, const u8 *buf,
+ size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+ char single, const u8 **final_point) {
+ return mcsheng64Exec16_i(m, state, buf, len, offAdj, cb, ctxt, single,
+ final_point, STOP_AT_MATCH);
+}
+
+static never_inline
+char mcsheng64Exec16_i_nm(const struct mcsheng64 *m, u32 *state, const u8 *buf,
+ size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+ char single, const u8 **final_point) {
+ return mcsheng64Exec16_i(m, state, buf, len, offAdj, cb, ctxt, single,
+ final_point, NO_MATCHES);
+}
+
+static really_inline
+char mcsheng64Exec16_i_ni(const struct mcsheng64 *m, u32 *state, const u8 *buf,
+ size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+ char single, const u8 **final_point,
+ enum MatchMode mode) {
+ if (mode == CALLBACK_OUTPUT) {
+ return mcsheng64Exec16_i_cb(m, state, buf, len, offAdj, cb, ctxt,
+ single, final_point);
+ } else if (mode == STOP_AT_MATCH) {
+ return mcsheng64Exec16_i_sam(m, state, buf, len, offAdj, cb, ctxt,
+ single, final_point);
+ } else {
+ assert (mode == NO_MATCHES);
+ return mcsheng64Exec16_i_nm(m, state, buf, len, offAdj, cb, ctxt,
+ single, final_point);
+ }
+}
+
+static really_inline
+u32 doNormal64_8(const struct mcsheng64 *m, const u8 **c_inout, const u8 *end, u32 s,
+ char do_accel, enum MatchMode mode) {
+ const u8 *c = *c_inout;
+ u32 sheng_end = m->sheng_end;
+ u32 accel_limit = m->accel_limit_8;
+ u32 accept_limit = m->accept_limit_8;
+
+ const u32 as = m->alphaShift;
+ const u8 *succ_table = (const u8 *)((const char *)m
+ + sizeof(struct mcsheng64));
+ /* Adjust the start of the succ table so it can be indexed by state id
+ * directly (rather than by a rebased normal id). As states with low ids
+ * are never processed here, no reads land before the real table. Note:
+ * due to the size of the sheng tables, the rebased succ_table pointer
+ * still points inside the engine. */
+ succ_table -= sheng_end << as;
+
+ assert(s >= sheng_end);
+ while (c < end && s >= sheng_end) {
+ u8 cprime = m->remap[*c];
+ DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *c,
+ ourisprint(*c) ? *c : '?', cprime);
+ s = succ_table[(s << as) + cprime];
+
+ DEBUG_PRINTF("s: %u\n", s);
+ c++;
+ if (do_accel) {
+ if (s >= accel_limit) {
+ break;
+ }
+ } else {
+ if (mode != NO_MATCHES && s >= accept_limit) {
+ break;
+ }
+ }
+ }
+ *c_inout = c;
+ return s;
+}
+
+static really_inline
+char mcsheng64Exec8_i(const struct mcsheng64 *m, u32 *state, const u8 *buf,
+ size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+ char single, const u8 **c_final, enum MatchMode mode) {
+ if (!len) {
+ *c_final = buf;
+ return MO_ALIVE;
+ }
+ u32 s = *state;
+ const u8 *c = buf;
+ const u8 *c_end = buf + len;
+ const u8 sheng_end = m->sheng_end;
+
+ const struct mstate_aux *aux
+ = (const struct mstate_aux *)((const char *)m + m->aux_offset
+ - sizeof(struct NFA));
+ u32 accept_limit = m->accept_limit_8;
+
+ u32 cached_accept_id = 0;
+ u32 cached_accept_state = 0;
+
+ DEBUG_PRINTF("accel %hu, accept %u\n", m->accel_limit_8, accept_limit);
+
+ DEBUG_PRINTF("s: %u, len %zu\n", s, len);
+
+ const u8 *min_accel_offset = c;
+ if (!m->has_accel || len < ACCEL_MIN_LEN) {
+ min_accel_offset = c_end;
+ goto without_accel;
+ }
+
+ goto with_accel;
+
+without_accel:
+ do {
+ assert(c < min_accel_offset);
+ if (!s) {
+ goto exit;
+ } else if (s < sheng_end) {
+ s = doSheng64(m, &c, min_accel_offset, c_end, s, 0);
+ } else {
+ s = doNormal64_8(m, &c, min_accel_offset, s, 0, mode);
+ assert(c <= min_accel_offset);
+ }
+
+ if (mode != NO_MATCHES && s >= accept_limit) {
+ if (mode == STOP_AT_MATCH) {
+ DEBUG_PRINTF("match - pausing\n");
+ *state = s;
+ *c_final = c - 1;
+ return MO_MATCHES_PENDING;
+ }
+
+ u64a loc = (c - 1) - buf + offAdj + 1;
+ if (single) {
+ DEBUG_PRINTF("reporting %u\n", m->arb_report);
+ if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) {
+ return MO_DEAD;
+ }
+ } else if (doComplexReport64(cb, ctxt, m, s, loc, 0,
+ &cached_accept_state,
+ &cached_accept_id)
+ == MO_HALT_MATCHING) {
+ return MO_DEAD;
+ }
+ }
+
+ assert(c <= c_end); /* sheng is fuzzy for min_accel_offset */
+ } while (c < min_accel_offset);
+
+ if (c == c_end) {
+ goto exit;
+ }
+
+with_accel:
+ do {
+ u32 accel_limit = m->accel_limit_8;
+
+ assert(c < c_end);
+ if (!s) {
+ goto exit;
+ } else if (s < sheng_end) {
+ if (s > m->sheng_accel_limit) {
+ c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end);
+ if (c == c_end) {
+ goto exit;
+ } else {
+ goto without_accel;
+ }
+ }
+ s = doSheng64(m, &c, c_end, c_end, s, 1);
+ } else {
+ if (s >= accel_limit && aux[s].accel_offset) {
+ c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end);
+ if (c == c_end) {
+ goto exit;
+ } else {
+ goto without_accel;
+ }
+ }
+ s = doNormal64_8(m, &c, c_end, s, 1, mode);
+ }
+
+ if (mode != NO_MATCHES && s >= accept_limit) {
+ if (mode == STOP_AT_MATCH) {
+ DEBUG_PRINTF("match - pausing\n");
+ *state = s;
+ *c_final = c - 1;
+ return MO_MATCHES_PENDING;
+ }
+
+ u64a loc = (c - 1) - buf + offAdj + 1;
+ if (single) {
+ DEBUG_PRINTF("reporting %u\n", m->arb_report);
+ if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) {
+ return MO_DEAD;
+ }
+ } else if (doComplexReport64(cb, ctxt, m, s, loc, 0,
+ &cached_accept_state,
+ &cached_accept_id)
+ == MO_HALT_MATCHING) {
+ return MO_DEAD;
+ }
+ }
+
+ assert(c <= c_end);
+ } while (c < c_end);
+
+exit:
+ *state = s;
+ if (mode == STOP_AT_MATCH) {
+ *c_final = c_end;
+ }
+ return MO_ALIVE;
+}
+
+static never_inline
+char mcsheng64Exec8_i_cb(const struct mcsheng64 *m, u32 *state, const u8 *buf,
+ size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+ char single, const u8 **final_point) {
+ return mcsheng64Exec8_i(m, state, buf, len, offAdj, cb, ctxt, single,
+ final_point, CALLBACK_OUTPUT);
+}
+
+static never_inline
+char mcsheng64Exec8_i_sam(const struct mcsheng64 *m, u32 *state, const u8 *buf,
+ size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+ char single, const u8 **final_point) {
+ return mcsheng64Exec8_i(m, state, buf, len, offAdj, cb, ctxt, single,
+ final_point, STOP_AT_MATCH);
+}
+
+static never_inline
+char mcsheng64Exec8_i_nm(const struct mcsheng64 *m, u32 *state, const u8 *buf,
+ size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+ char single, const u8 **final_point) {
+ return mcsheng64Exec8_i(m, state, buf, len, offAdj, cb, ctxt, single,
+ final_point, NO_MATCHES);
+}
+
+static really_inline
+char mcsheng64Exec8_i_ni(const struct mcsheng64 *m, u32 *state, const u8 *buf,
+ size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
+ char single, const u8 **final_point,
+ enum MatchMode mode) {
+ if (mode == CALLBACK_OUTPUT) {
+ return mcsheng64Exec8_i_cb(m, state, buf, len, offAdj, cb, ctxt, single,
+ final_point);
+ } else if (mode == STOP_AT_MATCH) {
+ return mcsheng64Exec8_i_sam(m, state, buf, len, offAdj, cb, ctxt,
+ single, final_point);
+ } else {
+ assert(mode == NO_MATCHES);
+ return mcsheng64Exec8_i_nm(m, state, buf, len, offAdj, cb, ctxt, single,
+ final_point);
+ }
+}
+
+static really_inline
+char mcshengCheckEOD64(const struct NFA *nfa, u32 s, u64a offset,
+ NfaCallback cb, void *ctxt) {
+ const struct mcsheng64 *m = getImplNfa(nfa);
+ const struct mstate_aux *aux = get_aux64(m, s);
+
+ if (!aux->accept_eod) {
+ return MO_CONTINUE_MATCHING;
+ }
+ return doComplexReport64(cb, ctxt, m, s, offset, 1, NULL, NULL);
+}
+
+static really_inline
+char nfaExecMcSheng64_16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
+ const u8 *hend, NfaCallback cb, void *context,
+ struct mq *q, char single, s64a end,
+ enum MatchMode mode) {
+ assert(n->type == MCSHENG_64_NFA_16);
+ const struct mcsheng64 *m = getImplNfa(n);
+ s64a sp;
+
+ assert(ISALIGNED_N(q->state, 2));
+ u32 s = *(u16 *)q->state;
+
+ if (q->report_current) {
+ assert(s);
+ assert(get_aux64(m, s)->accept);
+
+ int rv;
+ if (single) {
+ DEBUG_PRINTF("reporting %u\n", m->arb_report);
+ rv = cb(0, q_cur_offset(q), m->arb_report, context);
+ } else {
+ u32 cached_accept_id = 0;
+ u32 cached_accept_state = 0;
+
+ rv = doComplexReport64(cb, context, m, s, q_cur_offset(q), 0,
+ &cached_accept_state, &cached_accept_id);
+ }
+
+ q->report_current = 0;
+
+ if (rv == MO_HALT_MATCHING) {
+ return MO_DEAD;
+ }
+ }
+
+ sp = q_cur_loc(q);
+ q->cur++;
+
+ const u8 *cur_buf = sp < 0 ? hend : buffer;
+
+ assert(q->cur);
+ if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) {
+ DEBUG_PRINTF("this is as far as we go\n");
+ q->cur--;
+ q->items[q->cur].type = MQE_START;
+ q->items[q->cur].location = end;
+ *(u16 *)q->state = s;
+ return MO_ALIVE;
+ }
+
+ while (1) {
+ assert(q->cur < q->end);
+ s64a ep = q->items[q->cur].location;
+ if (mode != NO_MATCHES) {
+ ep = MIN(ep, end);
+ }
+
+ assert(ep >= sp);
+
+ s64a local_ep = ep;
+ if (sp < 0) {
+ local_ep = MIN(0, ep);
+ }
+
+ /* do main buffer region */
+ const u8 *final_look;
+ char rv = mcsheng64Exec16_i_ni(m, &s, cur_buf + sp, local_ep - sp,
+ offset + sp, cb, context, single,
+ &final_look, mode);
+ if (rv == MO_DEAD) {
+ *(u16 *)q->state = 0;
+ return MO_DEAD;
+ }
+ if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) {
+ DEBUG_PRINTF("this is as far as we go\n");
+ DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf);
+
+ assert(q->cur);
+ assert(final_look != cur_buf + local_ep);
+
+ q->cur--;
+ q->items[q->cur].type = MQE_START;
+ q->items[q->cur].location = final_look - cur_buf + 1; /* due to
+ * early -1 */
+ *(u16 *)q->state = s;
+ return MO_MATCHES_PENDING;
+ }
+
+ assert(rv == MO_ALIVE);
+ assert(q->cur);
+ if (mode != NO_MATCHES && q->items[q->cur].location > end) {
+ DEBUG_PRINTF("this is as far as we go\n");
+ q->cur--;
+ q->items[q->cur].type = MQE_START;
+ q->items[q->cur].location = end;
+ *(u16 *)q->state = s;
+ return MO_ALIVE;
+ }
+
+ sp = local_ep;
+
+ if (sp == 0) {
+ cur_buf = buffer;
+ }
+
+ if (sp != ep) {
+ continue;
+ }
+
+ switch (q->items[q->cur].type) {
+ case MQE_TOP:
+ assert(sp + offset || !s);
+ if (sp + offset == 0) {
+ s = m->start_anchored;
+ break;
+ }
+ s = mcshengEnableStarts64(m, s);
+ break;
+ case MQE_END:
+ *(u16 *)q->state = s;
+ q->cur++;
+ return s ? MO_ALIVE : MO_DEAD;
+ default:
+ assert(!"invalid queue event");
+ }
+
+ q->cur++;
+ }
+}
+
+static really_inline
+char nfaExecMcSheng64_8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
+ const u8 *hend, NfaCallback cb, void *context,
+ struct mq *q, char single, s64a end,
+ enum MatchMode mode) {
+ assert(n->type == MCSHENG_64_NFA_8);
+ const struct mcsheng64 *m = getImplNfa(n);
+ s64a sp;
+
+ u32 s = *(u8 *)q->state;
+
+ if (q->report_current) {
+ assert(s);
+ assert(s >= m->accept_limit_8);
+
+ int rv;
+ if (single) {
+ DEBUG_PRINTF("reporting %u\n", m->arb_report);
+
+ rv = cb(0, q_cur_offset(q), m->arb_report, context);
+ } else {
+ u32 cached_accept_id = 0;
+ u32 cached_accept_state = 0;
+
+ rv = doComplexReport64(cb, context, m, s, q_cur_offset(q), 0,
+ &cached_accept_state, &cached_accept_id);
+ }
+
+ q->report_current = 0;
+
+ if (rv == MO_HALT_MATCHING) {
+ return MO_DEAD;
+ }
+ }
+
+ sp = q_cur_loc(q);
+ q->cur++;
+
+ const u8 *cur_buf = sp < 0 ? hend : buffer;
+
+ if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) {
+ DEBUG_PRINTF("this is as far as we go\n");
+ q->cur--;
+ q->items[q->cur].type = MQE_START;
+ q->items[q->cur].location = end;
+ *(u8 *)q->state = s;
+ return MO_ALIVE;
+ }
+
+ while (1) {
+ DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" :
+ q->items[q->cur].type == MQE_END ? "END" : "???",
+ q->items[q->cur].location + offset);
+ assert(q->cur < q->end);
+ s64a ep = q->items[q->cur].location;
+ if (mode != NO_MATCHES) {
+ ep = MIN(ep, end);
+ }
+
+ assert(ep >= sp);
+
+ s64a local_ep = ep;
+ if (sp < 0) {
+ local_ep = MIN(0, ep);
+ }
+
+ const u8 *final_look;
+ char rv = mcsheng64Exec8_i_ni(m, &s, cur_buf + sp, local_ep - sp,
+ offset + sp, cb, context, single,
+ &final_look, mode);
+ if (rv == MO_HALT_MATCHING) {
+ *(u8 *)q->state = 0;
+ return MO_DEAD;
+ }
+ if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) {
+ DEBUG_PRINTF("this is as far as we go\n");
+ DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf);
+
+ assert(q->cur);
+ assert(final_look != cur_buf + local_ep);
+
+ q->cur--;
+ q->items[q->cur].type = MQE_START;
+ q->items[q->cur].location = final_look - cur_buf + 1; /* due to
+ * early -1 */
+ *(u8 *)q->state = s;
+ return MO_MATCHES_PENDING;
+ }
+
+ assert(rv == MO_ALIVE);
+ assert(q->cur);
+ if (mode != NO_MATCHES && q->items[q->cur].location > end) {
+ DEBUG_PRINTF("this is as far as we go\n");
+ assert(q->cur);
+ q->cur--;
+ q->items[q->cur].type = MQE_START;
+ q->items[q->cur].location = end;
+ *(u8 *)q->state = s;
+ return MO_ALIVE;
+ }
+
+ sp = local_ep;
+
+ if (sp == 0) {
+ cur_buf = buffer;
+ }
+
+ if (sp != ep) {
+ continue;
+ }
+
+ switch (q->items[q->cur].type) {
+ case MQE_TOP:
+ assert(sp + offset || !s);
+ if (sp + offset == 0) {
+ s = (u8)m->start_anchored;
+ break;
+ }
+ s = mcshengEnableStarts64(m, s);
+ break;
+ case MQE_END:
+ *(u8 *)q->state = s;
+ q->cur++;
+ return s ? MO_ALIVE : MO_DEAD;
+ default:
+ assert(!"invalid queue event");
+ }
+
+ q->cur++;
+ }
+}
+
+char nfaExecMcSheng64_8_Q(const struct NFA *n, struct mq *q, s64a end) {
+ u64a offset = q->offset;
+ const u8 *buffer = q->buffer;
+ NfaCallback cb = q->cb;
+ void *context = q->context;
+ assert(n->type == MCSHENG_64_NFA_8);
+ const struct mcsheng64 *m = getImplNfa(n);
+ const u8 *hend = q->history + q->hlength;
+
+ return nfaExecMcSheng64_8_Q2i(n, offset, buffer, hend, cb, context, q,
+ m->flags & MCSHENG_FLAG_SINGLE, end,
+ CALLBACK_OUTPUT);
+}
+
+char nfaExecMcSheng64_16_Q(const struct NFA *n, struct mq *q, s64a end) {
+ u64a offset = q->offset;
+ const u8 *buffer = q->buffer;
+ NfaCallback cb = q->cb;
+ void *context = q->context;
+ assert(n->type == MCSHENG_64_NFA_16);
+ const struct mcsheng64 *m = getImplNfa(n);
+ const u8 *hend = q->history + q->hlength;
+
+ return nfaExecMcSheng64_16_Q2i(n, offset, buffer, hend, cb, context, q,
+ m->flags & MCSHENG_FLAG_SINGLE, end,
+ CALLBACK_OUTPUT);
+}
+
+char nfaExecMcSheng64_8_reportCurrent(const struct NFA *n, struct mq *q) {
+ const struct mcsheng64 *m = getImplNfa(n);
+ NfaCallback cb = q->cb;
+ void *ctxt = q->context;
+ u32 s = *(u8 *)q->state;
+ u8 single = m->flags & MCSHENG_FLAG_SINGLE;
+ u64a offset = q_cur_offset(q);
+ assert(q_cur_type(q) == MQE_START);
+ assert(s);
+
+ if (s >= m->accept_limit_8) {
+ if (single) {
+ DEBUG_PRINTF("reporting %u\n", m->arb_report);
+ cb(0, offset, m->arb_report, ctxt);
+ } else {
+ u32 cached_accept_id = 0;
+ u32 cached_accept_state = 0;
+
+ doComplexReport64(cb, ctxt, m, s, offset, 0, &cached_accept_state,
+ &cached_accept_id);
+ }
+ }
+
+ return 0;
+}
+
+char nfaExecMcSheng64_16_reportCurrent(const struct NFA *n, struct mq *q) {
+ const struct mcsheng64 *m = getImplNfa(n);
+ NfaCallback cb = q->cb;
+ void *ctxt = q->context;
+ u32 s = *(u16 *)q->state;
+ const struct mstate_aux *aux = get_aux64(m, s);
+ u8 single = m->flags & MCSHENG_FLAG_SINGLE;
+ u64a offset = q_cur_offset(q);
+ assert(q_cur_type(q) == MQE_START);
+ DEBUG_PRINTF("state %u\n", s);
+ assert(s);
+
+ if (aux->accept) {
+ if (single) {
+ DEBUG_PRINTF("reporting %u\n", m->arb_report);
+ cb(0, offset, m->arb_report, ctxt);
+ } else {
+ u32 cached_accept_id = 0;
+ u32 cached_accept_state = 0;
+
+ doComplexReport64(cb, ctxt, m, s, offset, 0, &cached_accept_state,
+ &cached_accept_id);
+ }
+ }
+
+ return 0;
+}
+
+static
+char mcshengHasAccept64(const struct mcsheng64 *m, const struct mstate_aux *aux,
+ ReportID report) {
+ assert(m && aux);
+
+ if (!aux->accept) {
+ return 0;
+ }
+
+ const struct report_list *rl = (const struct report_list *)
+ ((const char *)m + aux->accept - sizeof(struct NFA));
+ assert(ISALIGNED_N(rl, 4));
+
+ DEBUG_PRINTF("report list has %u entries\n", rl->count);
+
+ for (u32 i = 0; i < rl->count; i++) {
+ if (rl->report[i] == report) {
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+char nfaExecMcSheng64_8_inAccept(const struct NFA *n, ReportID report,
+ struct mq *q) {
+ assert(n && q);
+
+ const struct mcsheng64 *m = getImplNfa(n);
+ u8 s = *(u8 *)q->state;
+ DEBUG_PRINTF("checking accepts for %hhu\n", s);
+
+ return mcshengHasAccept64(m, get_aux64(m, s), report);
+}
+
+char nfaExecMcSheng64_8_inAnyAccept(const struct NFA *n, struct mq *q) {
+ assert(n && q);
+
+ const struct mcsheng64 *m = getImplNfa(n);
+ u8 s = *(u8 *)q->state;
+ DEBUG_PRINTF("checking accepts for %hhu\n", s);
+
+ return !!get_aux64(m, s)->accept;
+}
+
+char nfaExecMcSheng64_16_inAccept(const struct NFA *n, ReportID report,
+ struct mq *q) {
+ assert(n && q);
+
+ const struct mcsheng64 *m = getImplNfa(n);
+ u16 s = *(u16 *)q->state;
+ DEBUG_PRINTF("checking accepts for %hu\n", s);
+
+ return mcshengHasAccept64(m, get_aux64(m, s), report);
+}
+
+char nfaExecMcSheng64_16_inAnyAccept(const struct NFA *n, struct mq *q) {
+ assert(n && q);
+
+ const struct mcsheng64 *m = getImplNfa(n);
+ u16 s = *(u16 *)q->state;
+ DEBUG_PRINTF("checking accepts for %hu\n", s);
+
+ return !!get_aux64(m, s)->accept;
+}
+
+char nfaExecMcSheng64_8_Q2(const struct NFA *n, struct mq *q, s64a end) {
+ u64a offset = q->offset;
+ const u8 *buffer = q->buffer;
+ NfaCallback cb = q->cb;
+ void *context = q->context;
+ assert(n->type == MCSHENG_64_NFA_8);
+ const struct mcsheng64 *m = getImplNfa(n);
+ const u8 *hend = q->history + q->hlength;
+
+ return nfaExecMcSheng64_8_Q2i(n, offset, buffer, hend, cb, context, q,
+ m->flags & MCSHENG_FLAG_SINGLE, end,
+ STOP_AT_MATCH);
+}
+
+char nfaExecMcSheng64_16_Q2(const struct NFA *n, struct mq *q, s64a end) {
+ u64a offset = q->offset;
+ const u8 *buffer = q->buffer;
+ NfaCallback cb = q->cb;
+ void *context = q->context;
+ assert(n->type == MCSHENG_64_NFA_16);
+ const struct mcsheng64 *m = getImplNfa(n);
+ const u8 *hend = q->history + q->hlength;
+
+ return nfaExecMcSheng64_16_Q2i(n, offset, buffer, hend, cb, context, q,
+ m->flags & MCSHENG_FLAG_SINGLE, end,
+ STOP_AT_MATCH);
+}
+
+char nfaExecMcSheng64_8_QR(const struct NFA *n, struct mq *q, ReportID report) {
+ u64a offset = q->offset;
+ const u8 *buffer = q->buffer;
+ NfaCallback cb = q->cb;
+ void *context = q->context;
+ assert(n->type == MCSHENG_64_NFA_8);
+ const struct mcsheng64 *m = getImplNfa(n);
+ const u8 *hend = q->history + q->hlength;
+
+ char rv = nfaExecMcSheng64_8_Q2i(n, offset, buffer, hend, cb, context, q,
+ m->flags & MCSHENG_FLAG_SINGLE,
+ 0 /* end */, NO_MATCHES);
+ if (rv && nfaExecMcSheng64_8_inAccept(n, report, q)) {
+ return MO_MATCHES_PENDING;
+ } else {
+ return rv;
+ }
+}
+
+char nfaExecMcSheng64_16_QR(const struct NFA *n, struct mq *q, ReportID report) {
+ u64a offset = q->offset;
+ const u8 *buffer = q->buffer;
+ NfaCallback cb = q->cb;
+ void *context = q->context;
+ assert(n->type == MCSHENG_64_NFA_16);
+ const struct mcsheng64 *m = getImplNfa(n);
+ const u8 *hend = q->history + q->hlength;
+
+ char rv = nfaExecMcSheng64_16_Q2i(n, offset, buffer, hend, cb, context, q,
+ m->flags & MCSHENG_FLAG_SINGLE,
+ 0 /* end */, NO_MATCHES);
+
+ if (rv && nfaExecMcSheng64_16_inAccept(n, report, q)) {
+ return MO_MATCHES_PENDING;
+ } else {
+ return rv;
+ }
+}
+
+char nfaExecMcSheng64_8_initCompressedState(const struct NFA *nfa, u64a offset,
+ void *state, UNUSED u8 key) {
+ const struct mcsheng64 *m = getImplNfa(nfa);
+ u8 s = offset ? m->start_floating : m->start_anchored;
+ if (s) {
+ *(u8 *)state = s;
+ return 1;
+ }
+ return 0;
+}
+
+char nfaExecMcSheng64_16_initCompressedState(const struct NFA *nfa, u64a offset,
+ void *state, UNUSED u8 key) {
+ const struct mcsheng64 *m = getImplNfa(nfa);
+ u16 s = offset ? m->start_floating : m->start_anchored;
+ if (s) {
+ unaligned_store_u16(state, s);
+ return 1;
+ }
+ return 0;
+}
+
+char nfaExecMcSheng64_8_testEOD(const struct NFA *nfa, const char *state,
+ UNUSED const char *streamState, u64a offset,
+ NfaCallback callback, void *context) {
+ return mcshengCheckEOD64(nfa, *(const u8 *)state, offset, callback,
+ context);
+}
+
+char nfaExecMcSheng64_16_testEOD(const struct NFA *nfa, const char *state,
+ UNUSED const char *streamState, u64a offset,
+ NfaCallback callback, void *context) {
+ assert(ISALIGNED_N(state, 2));
+ return mcshengCheckEOD64(nfa, *(const u16 *)state, offset, callback,
+ context);
+}
+
+char nfaExecMcSheng64_8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) {
+ assert(nfa->scratchStateSize == 1);
+ *(u8 *)q->state = 0;
+ return 0;
+}
+
+char nfaExecMcSheng64_16_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) {
+ assert(nfa->scratchStateSize == 2);
+ assert(ISALIGNED_N(q->state, 2));
+ *(u16 *)q->state = 0;
+ return 0;
+}
+
+char nfaExecMcSheng64_8_queueCompressState(UNUSED const struct NFA *nfa,
+ const struct mq *q, UNUSED s64a loc) {
+ void *dest = q->streamState;
+ const void *src = q->state;
+ assert(nfa->scratchStateSize == 1);
+ assert(nfa->streamStateSize == 1);
+ *(u8 *)dest = *(const u8 *)src;
+ return 0;
+}
+
+char nfaExecMcSheng64_8_expandState(UNUSED const struct NFA *nfa, void *dest,
+ const void *src, UNUSED u64a offset,
+ UNUSED u8 key) {
+ assert(nfa->scratchStateSize == 1);
+ assert(nfa->streamStateSize == 1);
+ *(u8 *)dest = *(const u8 *)src;
+ return 0;
+}
+
+char nfaExecMcSheng64_16_queueCompressState(UNUSED const struct NFA *nfa,
+ const struct mq *q,
+ UNUSED s64a loc) {
+ void *dest = q->streamState;
+ const void *src = q->state;
+ assert(nfa->scratchStateSize == 2);
+ assert(nfa->streamStateSize == 2);
+ assert(ISALIGNED_N(src, 2));
+ unaligned_store_u16(dest, *(const u16 *)(src));
+ return 0;
+}
+
+char nfaExecMcSheng64_16_expandState(UNUSED const struct NFA *nfa, void *dest,
+ const void *src, UNUSED u64a offset,
+ UNUSED u8 key) {
+ assert(nfa->scratchStateSize == 2);
+ assert(nfa->streamStateSize == 2);
+ assert(ISALIGNED_N(dest, 2));
+ *(u16 *)dest = unaligned_load_u16(src);
+ return 0;
+}
+#endif
#include <set>
#include <deque>
#include <vector>
-
#include <boost/range/adaptor/map.hpp>
using namespace std;
}
}
+#if defined(HAVE_AVX512VBMI)
+static
+mstate_aux *getAux64(NFA *n, dstate_id_t i) {
+ mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(n);
+ mstate_aux *aux_base = (mstate_aux *)((char *)n + m->aux_offset);
+
+ mstate_aux *aux = aux_base + i;
+ assert((const char *)aux < (const char *)n + m->length);
+ return aux;
+}
+
+static
+void createShuffleMasks64(mcsheng64 *m, const dfa_info &info,
+ dstate_id_t sheng_end,
+ const map<dstate_id_t, AccelScheme> &accel_escape_info) {
+ DEBUG_PRINTF("using first %hu states for a sheng\n", sheng_end);
+ assert(sheng_end > DEAD_STATE + 1);
+ assert(sheng_end <= sizeof(m512) + 1);
+ vector<array<u8, sizeof(m512)>> masks;
+ masks.resize(info.alpha_size);
+ /* -1 to avoid wasting a slot as we do not include dead state */
+ vector<dstate_id_t> raw_ids;
+ raw_ids.resize(sheng_end - 1);
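+ /* raw_ids[k] will record which raw dfa state was assigned sheng_id k;
+ * the masks below are indexed by sheng_id (impl id - 1), the dead state
+ * getting no slot of its own. */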
+ for (dstate_id_t s = DEAD_STATE + 1; s < info.states.size(); s++) {
+ assert(info.implId(s)); /* should not map to DEAD_STATE */
+ if (info.is_sheng(s)) {
+ raw_ids[info.extra[s].sheng_id] = s;
+ }
+ }
+ for (u32 i = 0; i < info.alpha_size; i++) {
+ if (i == info.alpha_remap[TOP]) {
+ continue;
+ }
+ auto &mask = masks[i];
+ assert(sizeof(mask) == sizeof(m512));
+ mask.fill(0);
+
+ for (dstate_id_t sheng_id = 0; sheng_id < sheng_end - 1; sheng_id++) {
+ dstate_id_t raw_id = raw_ids[sheng_id];
+ dstate_id_t next_id = info.implId(info.states[raw_id].next[i]);
+ if (next_id == DEAD_STATE) {
+ next_id = sheng_end - 1;
+ } else if (next_id < sheng_end) {
+ next_id--;
+ }
+ DEBUG_PRINTF("%hu: %u->next %hu\n", sheng_id, i, next_id);
+ mask[sheng_id] = verify_u8(next_id);
+ }
+ }
+ for (u32 i = 0; i < N_CHARS; i++) {
+ assert(info.alpha_remap[i] != info.alpha_remap[TOP]);
+ memcpy((u8 *)&m->sheng_succ_masks[i],
+ (u8 *)masks[info.alpha_remap[i]].data(), sizeof(m512));
+ }
+ m->sheng_end = sheng_end;
+ m->sheng_accel_limit = sheng_end - 1;
+
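+ /* Pull the accel limit down to the smallest sheng_id that has an accel
+ * scheme; at run time doSheng64 exits the sheng loop once the state
+ * reaches this limit (or sheng_limit when not accelerating). */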
+ for (dstate_id_t s : raw_ids) {
+ if (contains(accel_escape_info, s)) {
+ LIMIT_TO_AT_MOST(&m->sheng_accel_limit, info.extra[s].sheng_id);
+ }
+ }
+}
+
+static
+void populateBasicInfo64(size_t state_size, const dfa_info &info,
+ u32 total_size, u32 aux_offset, u32 accel_offset,
+ u32 accel_count, ReportID arb, bool single, NFA *nfa) {
+ assert(state_size == sizeof(u16) || state_size == sizeof(u8));
+
+ nfa->length = total_size;
+ nfa->nPositions = info.states.size();
+
+ nfa->scratchStateSize = verify_u32(state_size);
+ nfa->streamStateSize = verify_u32(state_size);
+
+ if (state_size == sizeof(u8)) {
+ nfa->type = MCSHENG_64_NFA_8;
+ } else {
+ nfa->type = MCSHENG_64_NFA_16;
+ }
+
+ mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa);
+ for (u32 i = 0; i < 256; i++) {
+ m->remap[i] = verify_u8(info.alpha_remap[i]);
+ }
+ m->alphaShift = info.getAlphaShift();
+ m->length = total_size;
+ m->aux_offset = aux_offset;
+ m->accel_offset = accel_offset;
+ m->arb_report = arb;
+ m->state_count = verify_u16(info.size());
+ m->start_anchored = info.implId(info.raw.start_anchored);
+ m->start_floating = info.implId(info.raw.start_floating);
+ m->has_accel = accel_count ? 1 : 0;
+
+ if (single) {
+ m->flags |= MCSHENG_FLAG_SINGLE;
+ }
+}
+#endif
+
static
size_t calcShermanRegionSize(const dfa_info &info) {
size_t rv = 0;
/* returns false on error */
static
bool allocateImplId16(dfa_info &info, dstate_id_t sheng_end,
- dstate_id_t *sherman_base) {
+ dstate_id_t *sherman_base) {
info.states[0].impl_id = 0; /* dead is always 0 */
vector<dstate_id_t> norm;
}
#define MAX_SHENG_STATES 16
+#define MAX_SHENG64_STATES 64
#define MAX_SHENG_LEAKINESS 0.05
using LeakinessCache = ue2_unordered_map<pair<RdfaVertex, u32>, double>;
static
dstate_id_t find_sheng_states(dfa_info &info,
- map<dstate_id_t, AccelScheme> &accel_escape_info) {
+ map<dstate_id_t, AccelScheme> &accel_escape_info,
+ size_t max_sheng_states) {
RdfaGraph g(info.raw);
auto cyclics = find_vertices_in_cycles(g);
flat_set<dstate_id_t> considered = { DEAD_STATE };
bool seen_back_edge = false;
while (!to_consider.empty()
- && sheng_states.size() < MAX_SHENG_STATES) {
+ && sheng_states.size() < max_sheng_states) {
auto v = to_consider.front();
to_consider.pop_front();
if (!considered.insert(g[v].index).second) {
}
}
+#if defined(HAVE_AVX512VBMI)
+static
+void fill_in_aux_info64(NFA *nfa, const dfa_info &info,
+ const map<dstate_id_t, AccelScheme> &accel_escape_info,
+ u32 accel_offset, UNUSED u32 accel_end_offset,
+ const vector<u32> &reports,
+ const vector<u32> &reports_eod,
+ u32 report_base_offset,
+ const raw_report_info &ri) {
+ mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa);
+
+ vector<u32> reportOffsets;
+
+ ri.fillReportLists(nfa, report_base_offset, reportOffsets);
+
+ for (u32 i = 0; i < info.size(); i++) {
+ u16 impl_id = info.implId(i);
+ mstate_aux *this_aux = getAux64(nfa, impl_id);
+
+ fillInAux(this_aux, i, info, reports, reports_eod, reportOffsets);
+ if (contains(accel_escape_info, i)) {
+ this_aux->accel_offset = accel_offset;
+ accel_offset += info.strat.accelSize();
+ assert(accel_offset <= accel_end_offset);
+ assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
+ info.strat.buildAccel(i, accel_escape_info.at(i),
+ (void *)((char *)m + this_aux->accel_offset));
+ }
+ }
+}
+
+static
+u16 get_edge_flags64(NFA *nfa, dstate_id_t target_impl_id) {
+ mstate_aux *aux = getAux64(nfa, target_impl_id);
+ u16 flags = 0;
+
+ if (aux->accept) {
+ flags |= ACCEPT_FLAG;
+ }
+
+ if (aux->accel_offset) {
+ flags |= ACCEL_FLAG;
+ }
+
+ return flags;
+}
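+
+/* The edge flags returned above ride in the top bits of each 16-bit succ
+ * table entry, next to the state id in the low bits; the runtime tests them
+ * and then strips them with STATE_MASK. */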
+
+static
+void fill_in_succ_table_64_16(NFA *nfa, const dfa_info &info,
+ dstate_id_t sheng_end,
+ UNUSED dstate_id_t sherman_base) {
+ u16 *succ_table = (u16 *)((char *)nfa + sizeof(NFA) + sizeof(mcsheng64));
+
+ u8 alphaShift = info.getAlphaShift();
+ assert(alphaShift <= 8);
+
+ for (size_t i = 0; i < info.size(); i++) {
+ if (!info.is_normal(i)) {
+ assert(info.implId(i) < sheng_end || info.is_sherman(i));
+ continue;
+ }
+
+ assert(info.implId(i) < sherman_base);
+ u16 normal_id = verify_u16(info.implId(i) - sheng_end);
+
+ for (size_t s = 0; s < info.impl_alpha_size; s++) {
+ dstate_id_t raw_succ = info.states[i].next[s];
+ u16 &entry = succ_table[((size_t)normal_id << alphaShift) + s];
+
+ entry = info.implId(raw_succ);
+ entry |= get_edge_flags64(nfa, entry);
+ }
+ }
+}
+#endif
+
#define MAX_SHERMAN_LIST_LEN 8
static
}
}
+#if defined(HAVE_AVX512VBMI)
+static
+void fill_in_sherman64(NFA *nfa, dfa_info &info, UNUSED u16 sherman_limit) {
+ char *nfa_base = (char *)nfa;
+ mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa);
+ char *sherman_table = nfa_base + m->sherman_offset;
+
+ assert(ISALIGNED_16(sherman_table));
+ for (size_t i = 0; i < info.size(); i++) {
+ if (!info.is_sherman(i)) {
+ continue;
+ }
+ u16 fs = verify_u16(info.implId(i));
+ DEBUG_PRINTF("building sherman %zu impl %hu\n", i, fs);
+
+ assert(fs >= sherman_limit);
+
+ char *curr_sherman_entry
+ = sherman_table + (fs - m->sherman_limit) * SHERMAN_FIXED_SIZE;
+ assert(curr_sherman_entry <= nfa_base + m->length);
+
+ u8 len = verify_u8(info.impl_alpha_size - info.extra[i].daddytaken);
+ assert(len <= 9);
+ dstate_id_t d = info.states[i].daddy;
+
+ *(u8 *)(curr_sherman_entry + SHERMAN_TYPE_OFFSET) = SHERMAN_STATE;
+ *(u8 *)(curr_sherman_entry + SHERMAN_LEN_OFFSET) = len;
+ *(u16 *)(curr_sherman_entry + SHERMAN_DADDY_OFFSET) = info.implId(d);
+ u8 *chars = (u8 *)(curr_sherman_entry + SHERMAN_CHARS_OFFSET);
+
+ for (u16 s = 0; s < info.impl_alpha_size; s++) {
+ if (info.states[i].next[s] != info.states[d].next[s]) {
+ *(chars++) = (u8)s;
+ }
+ }
+
+ u16 *states = (u16 *)(curr_sherman_entry + SHERMAN_STATES_OFFSET(len));
+ for (u16 s = 0; s < info.impl_alpha_size; s++) {
+ if (info.states[i].next[s] != info.states[d].next[s]) {
+ DEBUG_PRINTF("s overrider %hu dad %hu char next %hu\n", fs,
+ info.implId(d),
+ info.implId(info.states[i].next[s]));
+ u16 entry_val = info.implId(info.states[i].next[s]);
+ entry_val |= get_edge_flags64(nfa, entry_val);
+ unaligned_store_u16((u8 *)states++, entry_val);
+ }
+ }
+ }
+}
+
+static
+bytecode_ptr<NFA> mcsheng64Compile16(dfa_info &info, dstate_id_t sheng_end,
+ const map<dstate_id_t, AccelScheme> &accel_escape_info,
+ const Grey &grey) {
+ DEBUG_PRINTF("building mcsheng 64-16\n");
+
+ vector<u32> reports; /* index in ri for the appropriate report list */
+ vector<u32> reports_eod; /* as above */
+ ReportID arb;
+ u8 single;
+
+ assert(info.getAlphaShift() <= 8);
+
+ // Sherman optimization
+ if (info.impl_alpha_size > 16) {
+ u16 total_daddy = 0;
+ for (u32 i = 0; i < info.size(); i++) {
+ find_better_daddy(info, i,
+ is_cyclic_near(info.raw, info.raw.start_anchored),
+ grey);
+ total_daddy += info.extra[i].daddytaken;
+ }
+
+ DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy,
+ info.size() * info.impl_alpha_size, info.size(),
+ info.impl_alpha_size);
+ }
+
+ u16 sherman_limit;
+ if (!allocateImplId16(info, sheng_end, &sherman_limit)) {
+ DEBUG_PRINTF("failed to allocate state numbers, %zu states total\n",
+ info.size());
+ return nullptr;
+ }
+ u16 count_real_states = sherman_limit - sheng_end;
+
+ auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb);
+
+ size_t tran_size = (1 << info.getAlphaShift()) * sizeof(u16)
+ * count_real_states;
+
+ size_t aux_size = sizeof(mstate_aux) * info.size();
+
+ size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcsheng64) + tran_size);
+ size_t accel_size = info.strat.accelSize() * accel_escape_info.size();
+ size_t accel_offset = ROUNDUP_N(aux_offset + aux_size
+ + ri->getReportListSize(), 32);
+ size_t sherman_offset = ROUNDUP_16(accel_offset + accel_size);
+ size_t sherman_size = calcShermanRegionSize(info);
+
+ size_t total_size = sherman_offset + sherman_size;
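+
+ /* Overall layout: NFA header | mcsheng64 | 16-bit succ table | aux
+ * structs | report lists | accel aux | sherman region; aux and sherman
+ * are 16-byte aligned and accel 32-byte aligned, per the rounding above. */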
+
+ accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */
+ assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
+
+ auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size);
+ mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa.get());
+
+ populateBasicInfo64(sizeof(u16), info, total_size, aux_offset, accel_offset,
+ accel_escape_info.size(), arb, single, nfa.get());
+ createShuffleMasks64(m, info, sheng_end, accel_escape_info);
+
+ /* copy in the mc header information */
+ m->sherman_offset = sherman_offset;
+ m->sherman_end = total_size;
+ m->sherman_limit = sherman_limit;
+
+ DEBUG_PRINTF("%hu sheng, %hu norm, %zu total\n", sheng_end,
+ count_real_states, info.size());
+
+ fill_in_aux_info64(nfa.get(), info, accel_escape_info, accel_offset,
+ sherman_offset - sizeof(NFA), reports, reports_eod,
+ aux_offset + aux_size, *ri);
+
+ fill_in_succ_table_64_16(nfa.get(), info, sheng_end, sherman_limit);
+
+ fill_in_sherman64(nfa.get(), info, sherman_limit);
+
+ return nfa;
+}
+
+static
+void fill_in_succ_table_64_8(NFA *nfa, const dfa_info &info,
+ dstate_id_t sheng_end) {
+ u8 *succ_table = (u8 *)nfa + sizeof(NFA) + sizeof(mcsheng64);
+
+ u8 alphaShift = info.getAlphaShift();
+ assert(alphaShift <= 8);
+
+ for (size_t i = 0; i < info.size(); i++) {
+ assert(!info.is_sherman(i));
+ if (!info.is_normal(i)) {
+ assert(info.implId(i) < sheng_end);
+ continue;
+ }
+ u8 normal_id = verify_u8(info.implId(i) - sheng_end);
+
+ for (size_t s = 0; s < info.impl_alpha_size; s++) {
+ dstate_id_t raw_succ = info.states[i].next[s];
+ succ_table[((size_t)normal_id << alphaShift) + s]
+ = info.implId(raw_succ);
+ }
+ }
+}
+#endif
+
static
void allocateImplId8(dfa_info &info, dstate_id_t sheng_end,
const map<dstate_id_t, AccelScheme> &accel_escape_info,
return nfa;
}
+#if defined(HAVE_AVX512VBMI)
+static
+bytecode_ptr<NFA> mcsheng64Compile8(dfa_info &info, dstate_id_t sheng_end,
+ const map<dstate_id_t, AccelScheme> &accel_escape_info) {
+ DEBUG_PRINTF("building mcsheng 64-8\n");
+
+ vector<u32> reports;
+ vector<u32> reports_eod;
+ ReportID arb;
+ u8 single;
+
+ auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb);
+
+ size_t normal_count = info.size() - sheng_end;
+
+ size_t tran_size = sizeof(u8) * (1 << info.getAlphaShift()) * normal_count;
+ size_t aux_size = sizeof(mstate_aux) * info.size();
+ size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcsheng64) + tran_size);
+ size_t accel_size = info.strat.accelSize() * accel_escape_info.size();
+ size_t accel_offset = ROUNDUP_N(aux_offset + aux_size
+ + ri->getReportListSize(), 32);
+ size_t total_size = accel_offset + accel_size;
+
+ DEBUG_PRINTF("aux_size %zu\n", aux_size);
+ DEBUG_PRINTF("aux_offset %zu\n", aux_offset);
+ DEBUG_PRINTF("rl size %u\n", ri->getReportListSize());
+ DEBUG_PRINTF("accel_size %zu\n", accel_size);
+ DEBUG_PRINTF("accel_offset %zu\n", accel_offset);
+ DEBUG_PRINTF("total_size %zu\n", total_size);
+
+ accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */
+ assert(ISALIGNED_N(accel_offset, alignof(union AccelAux)));
+
+ auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size);
+ mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa.get());
+
+ allocateImplId8(info, sheng_end, accel_escape_info, &m->accel_limit_8,
+ &m->accept_limit_8);
+
+ populateBasicInfo64(sizeof(u8), info, total_size, aux_offset, accel_offset,
+ accel_escape_info.size(), arb, single, nfa.get());
+ createShuffleMasks64(m, info, sheng_end, accel_escape_info);
+
+ fill_in_aux_info64(nfa.get(), info, accel_escape_info, accel_offset,
+ total_size - sizeof(NFA), reports, reports_eod,
+ aux_offset + aux_size, *ri);
+
+ fill_in_succ_table_64_8(nfa.get(), info, sheng_end);
+ DEBUG_PRINTF("rl size %zu\n", ri->size());
+
+ return nfa;
+}
+#endif
+
bytecode_ptr<NFA> mcshengCompile(raw_dfa &raw, const CompileContext &cc,
const ReportManager &rm) {
if (!cc.grey.allowMcSheng) {
map<dstate_id_t, AccelScheme> accel_escape_info
= info.strat.getAccelInfo(cc.grey);
- dstate_id_t sheng_end = find_sheng_states(info, accel_escape_info);
+ auto old_states = info.states;
+ dstate_id_t sheng_end = find_sheng_states(info, accel_escape_info, MAX_SHENG_STATES);
if (sheng_end <= DEAD_STATE + 1) {
+ info.states = old_states;
return nullptr;
}
bytecode_ptr<NFA> nfa;
+
if (!using8bit) {
nfa = mcshengCompile16(info, sheng_end, accel_escape_info, cc.grey);
} else {
nfa = mcshengCompile8(info, sheng_end, accel_escape_info);
}
+ if (!nfa) {
+ info.states = old_states;
+ return nfa;
+ }
+
+ if (has_eod_reports) {
+ nfa->flags |= NFA_ACCEPTS_EOD;
+ }
+
+ DEBUG_PRINTF("compile done\n");
+ return nfa;
+}
+
+#if defined(HAVE_AVX512VBMI)
+bytecode_ptr<NFA> mcshengCompile64(raw_dfa &raw, const CompileContext &cc,
+ const ReportManager &rm) {
+ if (!cc.grey.allowMcSheng) {
+ return nullptr;
+ }
+
+ mcclellan_build_strat mbs(raw, rm, false);
+ dfa_info info(mbs);
+ bool using8bit = cc.grey.allowMcClellan8 && info.size() <= 256;
+
+ if (!cc.streaming) { /* TODO: work out if we can do the strip in streaming
+ * mode with our semantics */
+ raw.stripExtraEodReports();
+ }
+
+ bool has_eod_reports = raw.hasEodReports();
+
+ map<dstate_id_t, AccelScheme> accel_escape_info
+ = info.strat.getAccelInfo(cc.grey);
+ dstate_id_t sheng_end64 = find_sheng_states(info, accel_escape_info, MAX_SHENG64_STATES);
+
+ if (sheng_end64 <= DEAD_STATE + 1) {
+ return nullptr;
+ }
+
+ bytecode_ptr<NFA> nfa;
+
+ assert((sheng_end64 > 17) && (sheng_end64 <= 65));
+ if (!using8bit) {
+ nfa = mcsheng64Compile16(info, sheng_end64, accel_escape_info, cc.grey);
+ } else {
+ nfa = mcsheng64Compile8(info, sheng_end64, accel_escape_info);
+ assert(nfa);
+ assert(nfa->type == MCSHENG_64_NFA_8);
+ }
+
if (!nfa) {
return nfa;
}
DEBUG_PRINTF("compile done\n");
return nfa;
}
+#endif
bool has_accel_mcsheng(const NFA *) {
return true; /* consider the sheng region as accelerated */
/*
- * Copyright (c) 2016-2017, Intel Corporation
+ * Copyright (c) 2016-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
}
}
+#if defined(HAVE_AVX512VBMI)
+static
+const mstate_aux *getAux64(const NFA *n, dstate_id_t i) {
+ auto *m = (const mcsheng64 *)getImplNfa(n);
+ auto *aux_base = (const mstate_aux *)((const char *)n + m->aux_offset);
+
+ const mstate_aux *aux = aux_base + i;
+
+ assert((const char *)aux < (const char *)n + m->length);
+ return aux;
+}
+
+static
+void next_states64(const NFA *n, u16 s, u16 *t) {
+ const mcsheng64 *m = (const mcsheng64 *)getImplNfa(n);
+ const mstate_aux *aux = getAux64(n, s);
+ const u32 as = m->alphaShift;
+ assert(s != DEAD_STATE);
+
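+ /* For sheng states, invert the createShuffleMasks64 encoding below: mask
+ * entries are 0-based with sheng_end - 1 standing in for dead, so map
+ * that sentinel back to DEAD_STATE and shift other in-sheng successors
+ * back up by one. */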
+ if (s < m->sheng_end) {
+ for (u16 c = 0; c < N_CHARS; c++) {
+ u8 sheng_s = s - 1;
+ auto trans_for_c = (const char *)&m->sheng_succ_masks[c];
+ assert(sheng_s < sizeof(m512));
+ u8 raw_succ = trans_for_c[sheng_s];
+ if (raw_succ == m->sheng_end - 1) {
+ t[c] = DEAD_STATE;
+ } else if (raw_succ < m->sheng_end) {
+ t[c] = raw_succ + 1;
+ } else {
+ t[c] = raw_succ;
+ }
+ }
+ } else if (n->type == MCSHENG_64_NFA_8) {
+ const u8 *succ_table = (const u8 *)((const char *)m + sizeof(mcsheng64));
+ for (u16 c = 0; c < N_CHARS; c++) {
+ u32 normal_id = s - m->sheng_end;
+ t[c] = succ_table[(normal_id << as) + m->remap[c]];
+ }
+ } else {
+ u16 base_s = s;
+ const char *winfo_base = (const char *)n + m->sherman_offset;
+ const char *state_base
+ = winfo_base + SHERMAN_FIXED_SIZE * (s - m->sherman_limit);
+
+ if (s >= m->sherman_limit) {
+ base_s = unaligned_load_u16(state_base + SHERMAN_DADDY_OFFSET);
+ assert(base_s >= m->sheng_end);
+ }
+
+ const u16 *succ_table = (const u16 *)((const char *)m
+ + sizeof(mcsheng64));
+ for (u16 c = 0; c < N_CHARS; c++) {
+ u32 normal_id = base_s - m->sheng_end;
+ t[c] = succ_table[(normal_id << as) + m->remap[c]];
+ }
+
+ if (s >= m->sherman_limit) {
+ UNUSED char type = *(state_base + SHERMAN_TYPE_OFFSET);
+ assert(type == SHERMAN_STATE);
+ u8 len = *(const u8 *)(state_base + SHERMAN_LEN_OFFSET);
+ const char *chars = state_base + SHERMAN_CHARS_OFFSET;
+ const u16 *states = (const u16 *)(state_base
+ + SHERMAN_STATES_OFFSET(len));
+
+ for (u8 i = 0; i < len; i++) {
+ for (u16 c = 0; c < N_CHARS; c++) {
+ if (m->remap[c] == chars[i]) {
+ t[c] = unaligned_load_u16((const u8*)&states[i]);
+ }
+ }
+ }
+ }
+
+ for (u16 c = 0; c < N_CHARS; c++) {
+ t[c] &= STATE_MASK;
+ }
+
+ }
+
+ t[TOP] = aux->top & STATE_MASK;
+}
+
+static
+void describeEdge64(FILE *f, const mcsheng64 *m, const u16 *t, u16 i) {
+ for (u16 s = 0; s < N_CHARS; s++) {
+ if (!t[s]) {
+ continue;
+ }
+
+ u16 ss;
+ for (ss = 0; ss < s; ss++) {
+ if (t[s] == t[ss]) {
+ break;
+ }
+ }
+
+ if (ss != s) {
+ continue;
+ }
+
+ CharReach reach;
+ for (ss = s; ss < 256; ss++) {
+ if (t[s] == t[ss]) {
+ reach.set(ss);
+ }
+ }
+
+ fprintf(f, "%u -> %u [ ", i, t[s]);
+ if (i < m->sheng_end && t[s] < m->sheng_end) {
+ fprintf(f, "color = red, fontcolor = red ");
+ }
+ fprintf(f, "label = \"");
+ describeClass(f, reach, 5, CC_OUT_DOT);
+
+ fprintf(f, "\" ];\n");
+ }
+}
+#endif
+
static
void dumpAccelDot(FILE *f, u16 i, const union AccelAux *accel) {
switch(accel->accel_type) {
}
+#if defined(HAVE_AVX512VBMI)
+static
+void describeNode64(const NFA *n, const mcsheng64 *m, u16 i, FILE *f) {
+ const mstate_aux *aux = getAux64(n, i);
+
+ bool isSherman = m->sherman_limit && i >= m->sherman_limit;
+
+ fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, "
+ "label = \"%u%s\" ]; \n", i, i, isSherman ? "w":"");
+
+ if (aux->accel_offset) {
+ dumpAccelDot(f, i, (const union AccelAux *)
+ ((const char *)m + aux->accel_offset));
+ }
+
+ if (i && i < m->sheng_end) {
+ fprintf(f, "%u [color = red, fontcolor = red]; \n", i);
+ }
+
+ if (aux->accept_eod) {
+ fprintf(f, "%u [ color = darkorchid ];\n", i);
+ }
+
+ if (aux->accept) {
+ fprintf(f, "%u [ shape = doublecircle ];\n", i);
+ }
+
+ if (aux->top && aux->top != i) {
+ fprintf(f, "%u -> %u [color = darkgoldenrod weight=0.1 ]\n", i,
+ aux->top);
+ }
+
+ if (i == m->start_anchored) {
+ fprintf(f, "STARTA -> %u [color = blue ]\n", i);
+ }
+
+ if (i == m->start_floating) {
+ fprintf(f, "STARTF -> %u [color = red ]\n", i);
+ }
+
+ if (isSherman) {
+ const char *winfo_base = (const char *)n + m->sherman_offset;
+ const char *state_base
+ = winfo_base + SHERMAN_FIXED_SIZE * (i - m->sherman_limit);
+ assert(state_base < (const char *)m + m->length - sizeof(NFA));
+ UNUSED u8 type = *(const u8 *)(state_base + SHERMAN_TYPE_OFFSET);
+ assert(type == SHERMAN_STATE);
+ fprintf(f, "%u [ fillcolor = lightblue style=filled ];\n", i);
+ u16 daddy = *(const u16 *)(state_base + SHERMAN_DADDY_OFFSET);
+ if (daddy) {
+ fprintf(f, "%u -> %u [ color=royalblue style=dashed weight=0.1]\n",
+ i, daddy);
+ }
+ }
+
+ if (i && i < m->sheng_end) {
+ fprintf(f, "subgraph cluster_sheng { %u } \n", i);
+ }
+
+}
+#endif
+
static
void dumpDotPreambleDfa(FILE *f) {
dumpDotPreamble(f);
dumpTextReverse(nfa, f);
}
+#if defined(HAVE_AVX512VBMI)
+static
+void dump64_dot_16(const NFA *nfa, FILE *f) {
+ auto *m = (const mcsheng64 *)getImplNfa(nfa);
+
+ dumpDotPreambleDfa(f);
+
+ for (u16 i = 1; i < m->state_count; i++) {
+ describeNode64(nfa, m, i, f);
+
+ u16 t[ALPHABET_SIZE];
+
+ next_states64(nfa, i, t);
+
+ describeEdge64(f, m, t, i);
+ }
+
+ fprintf(f, "}\n");
+}
+
+static
+void dump64_dot_8(const NFA *nfa, FILE *f) {
+ auto m = (const mcsheng64 *)getImplNfa(nfa);
+
+ dumpDotPreambleDfa(f);
+
+ for (u16 i = 1; i < m->state_count; i++) {
+ describeNode64(nfa, m, i, f);
+
+ u16 t[ALPHABET_SIZE];
+
+ next_states64(nfa, i, t);
+
+ describeEdge64(f, m, t, i);
+ }
+
+ fprintf(f, "}\n");
+}
+
+static
+void dumpAccelMasks64(FILE *f, const mcsheng64 *m, const mstate_aux *aux) {
+ fprintf(f, "\n");
+ fprintf(f, "Acceleration\n");
+ fprintf(f, "------------\n");
+
+ for (u16 i = 0; i < m->state_count; i++) {
+ if (!aux[i].accel_offset) {
+ continue;
+ }
+
+ auto accel = (const AccelAux *)((const char *)m + aux[i].accel_offset);
+ fprintf(f, "%05hu ", i);
+ dumpAccelInfo(f, *accel);
+ }
+}
+
+static
+void describeAlphabet64(FILE *f, const mcsheng64 *m) {
+ map<u8, CharReach> rev;
+
+ for (u16 i = 0; i < N_CHARS; i++) {
+ rev[m->remap[i]].clear();
+ }
+
+ for (u16 i = 0; i < N_CHARS; i++) {
+ rev[m->remap[i]].set(i);
+ }
+
+ map<u8, CharReach>::const_iterator it;
+ fprintf(f, "\nAlphabet\n");
+ for (it = rev.begin(); it != rev.end(); ++it) {
+ fprintf(f, "%3hhu: ", it->first);
+ describeClass(f, it->second, 10240, CC_OUT_TEXT);
+ fprintf(f, "\n");
+ }
+ fprintf(f, "\n");
+}
+
+static
+void dumpCommonHeader64(FILE *f, const mcsheng64 *m) {
+ fprintf(f, "report: %u, states: %u, length: %u\n", m->arb_report,
+ m->state_count, m->length);
+ fprintf(f, "astart: %hu, fstart: %hu\n", m->start_anchored,
+ m->start_floating);
+ fprintf(f, "single accept: %d, has_accel: %d\n",
+ !!(m->flags & MCSHENG_FLAG_SINGLE), m->has_accel);
+ fprintf(f, "sheng_end: %hu\n", m->sheng_end);
+ fprintf(f, "sheng_accel_limit: %hu\n", m->sheng_accel_limit);
+}
+
+static
+void dump64_text_8(const NFA *nfa, FILE *f) {
+ auto m = (const mcsheng64 *)getImplNfa(nfa);
+ auto aux = (const mstate_aux *)((const char *)nfa + m->aux_offset);
+
+ fprintf(f, "mcsheng 64-8\n");
+ dumpCommonHeader64(f, m);
+ fprintf(f, "accel_limit: %hu, accept_limit %hu\n", m->accel_limit_8,
+ m->accept_limit_8);
+ fprintf(f, "\n");
+
+ describeAlphabet64(f, m);
+ dumpAccelMasks64(f, m, aux);
+
+ fprintf(f, "\n");
+ dumpTextReverse(nfa, f);
+}
+
+static
+void dump64_text_16(const NFA *nfa, FILE *f) {
+ auto *m = (const mcsheng64 *)getImplNfa(nfa);
+ auto *aux = (const mstate_aux *)((const char *)nfa + m->aux_offset);
+
+ fprintf(f, "mcsheng 64-16\n");
+ dumpCommonHeader64(f, m);
+ fprintf(f, "sherman_limit: %d, sherman_end: %d\n", (int)m->sherman_limit,
+ (int)m->sherman_end);
+ fprintf(f, "\n");
+
+ describeAlphabet64(f, m);
+ dumpAccelMasks64(f, m, aux);
+
+ fprintf(f, "\n");
+ dumpTextReverse(nfa, f);
+}
+#endif
+
void nfaExecMcSheng16_dump(const NFA *nfa, const string &base) {
assert(nfa->type == MCSHENG_NFA_16);
dump_text_16(nfa, StdioFile(base + ".txt", "w"));
dump_dot_8(nfa, StdioFile(base + ".dot", "w"));
}
+void nfaExecMcSheng64_16_dump(UNUSED const NFA *nfa, UNUSED const string &base) {
+#if defined(HAVE_AVX512VBMI)
+ assert(nfa->type == MCSHENG_64_NFA_16);
+ dump64_text_16(nfa, StdioFile(base + ".txt", "w"));
+ dump64_dot_16(nfa, StdioFile(base + ".dot", "w"));
+#endif
+}
+
+void nfaExecMcSheng64_8_dump(UNUSED const NFA *nfa, UNUSED const string &base) {
+#if defined(HAVE_AVX512VBMI)
+ assert(nfa->type == MCSHENG_64_NFA_8);
+ dump64_text_8(nfa, StdioFile(base + ".txt", "w"));
+ dump64_dot_8(nfa, StdioFile(base + ".dot", "w"));
+#endif
+}
+
} // namespace ue2