return (a | b | c | d) & (SHENG_STATE_FLAG_MASK);
}
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
static really_inline
u8 isDeadState32(const u8 a) {
return a & SHENG32_STATE_DEAD;
#define SHENG_IMPL sheng_cod
#define DEAD_FUNC isDeadState
#define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#define SHENG32_IMPL sheng32_cod
#define DEAD_FUNC32 isDeadState32
#define ACCEPT_FUNC32 isAcceptState32
#undef SHENG_IMPL
#undef DEAD_FUNC
#undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#undef SHENG32_IMPL
#undef DEAD_FUNC32
#undef ACCEPT_FUNC32
#define SHENG_IMPL sheng_co
#define DEAD_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#define SHENG32_IMPL sheng32_co
#define DEAD_FUNC32 dummyFunc
#define ACCEPT_FUNC32 isAcceptState32
#undef SHENG_IMPL
#undef DEAD_FUNC
#undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#undef SHENG32_IMPL
#undef DEAD_FUNC32
#undef ACCEPT_FUNC32
#define SHENG_IMPL sheng_samd
#define DEAD_FUNC isDeadState
#define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#define SHENG32_IMPL sheng32_samd
#define DEAD_FUNC32 isDeadState32
#define ACCEPT_FUNC32 isAcceptState32
#undef SHENG_IMPL
#undef DEAD_FUNC
#undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#undef SHENG32_IMPL
#undef DEAD_FUNC32
#undef ACCEPT_FUNC32
#define SHENG_IMPL sheng_sam
#define DEAD_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#define SHENG32_IMPL sheng32_sam
#define DEAD_FUNC32 dummyFunc
#define ACCEPT_FUNC32 isAcceptState32
#undef SHENG_IMPL
#undef DEAD_FUNC
#undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#undef SHENG32_IMPL
#undef DEAD_FUNC32
#undef ACCEPT_FUNC32
#define SHENG_IMPL sheng_nmd
#define DEAD_FUNC isDeadState
#define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#define SHENG32_IMPL sheng32_nmd
#define DEAD_FUNC32 isDeadState32
#define ACCEPT_FUNC32 dummyFunc
#undef SHENG_IMPL
#undef DEAD_FUNC
#undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#undef SHENG32_IMPL
#undef DEAD_FUNC32
#undef ACCEPT_FUNC32
#define SHENG_IMPL sheng_nm
#define DEAD_FUNC dummyFunc
#define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#define SHENG32_IMPL sheng32_nm
#define DEAD_FUNC32 dummyFunc
#define ACCEPT_FUNC32 dummyFunc
#undef SHENG_IMPL
#undef DEAD_FUNC
#undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#undef SHENG32_IMPL
#undef DEAD_FUNC32
#undef ACCEPT_FUNC32
#define INNER_ACCEL_FUNC isAccelState
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#define SHENG32_IMPL sheng32_4_coda
#define INTERESTING_FUNC32 hasInterestingStates32
#define INNER_DEAD_FUNC32 isDeadState32
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#define INNER_ACCEL_FUNC dummyFunc
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#define SHENG32_IMPL sheng32_4_cod
#define INTERESTING_FUNC32 hasInterestingStates32
#define INNER_DEAD_FUNC32 isDeadState32
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#define INNER_ACCEL_FUNC isAccelState
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#define SHENG32_IMPL sheng32_4_coa
#define INTERESTING_FUNC32 hasInterestingStates32
#define INNER_DEAD_FUNC32 dummyFunc
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#define INNER_ACCEL_FUNC dummyFunc
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#define SHENG32_IMPL sheng32_4_co
#define INTERESTING_FUNC32 hasInterestingStates32
#define INNER_DEAD_FUNC32 dummyFunc
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#define INNER_ACCEL_FUNC isAccelState
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#define SHENG32_IMPL sheng32_4_samda
#define INTERESTING_FUNC32 hasInterestingStates32
#define INNER_DEAD_FUNC32 isDeadState32
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#define INNER_ACCEL_FUNC dummyFunc
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#define SHENG32_IMPL sheng32_4_samd
#define INTERESTING_FUNC32 hasInterestingStates32
#define INNER_DEAD_FUNC32 isDeadState32
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#define INNER_ACCEL_FUNC isAccelState
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#define SHENG32_IMPL sheng32_4_sama
#define INTERESTING_FUNC32 hasInterestingStates32
#define INNER_DEAD_FUNC32 dummyFunc
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#define INNER_ACCEL_FUNC dummyFunc
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC isAcceptState
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#define SHENG32_IMPL sheng32_4_sam
#define INTERESTING_FUNC32 hasInterestingStates32
#define INNER_DEAD_FUNC32 dummyFunc
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#define INNER_ACCEL_FUNC dummyFunc
#define OUTER_ACCEL_FUNC isAccelState
#define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#define SHENG32_IMPL sheng32_4_nmda
#define INTERESTING_FUNC32 dummyFunc4
#define INNER_DEAD_FUNC32 dummyFunc
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#define INNER_ACCEL_FUNC dummyFunc
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#define SHENG32_IMPL sheng32_4_nmd
#define INTERESTING_FUNC32 dummyFunc4
#define INNER_DEAD_FUNC32 dummyFunc
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
#define INNER_ACCEL_FUNC dummyFunc
#define OUTER_ACCEL_FUNC dummyFunc
#define ACCEPT_FUNC dummyFunc
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#define SHENG32_IMPL sheng32_4_nm
#define INTERESTING_FUNC32 dummyFunc4
#define INNER_DEAD_FUNC32 dummyFunc
#undef INNER_ACCEL_FUNC
#undef OUTER_ACCEL_FUNC
#undef ACCEPT_FUNC
-#if defined(HAVE_AVX512VBMI)
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE)
#undef SHENG32_IMPL
#undef INTERESTING_FUNC32
#undef INNER_DEAD_FUNC32
return MO_CONTINUE_MATCHING;
}
+#if defined(HAVE_SVE)
+
+/*
+ * SVE implementation of the one-byte-per-step Sheng32 scan loop.
+ *
+ * state               in/out: current DFA state byte (state id in the bits
+ *                     selected by SHENG32_STATE_MASK, flags in the rest).
+ * cb, ctxt            report callback and its context.
+ * s                   engine structure; succ_masks holds one successor table
+ *                     per input byte value, report is used when 'single'.
+ * cached_accept_*     passed through to fireReports32().
+ * single              non-zero: fire s->report through fireSingleReport().
+ * base_offset, buf    match offsets are base_offset + (pos - buf) + 1.
+ * start, end          range of bytes to scan.
+ * scan_end            out: where scanning stopped.
+ *
+ * Returns MO_CONTINUE_MATCHING when the range was consumed (or the state was
+ * already dead), MO_MATCHES_PENDING when STOP_AT_MATCH is set and an accept
+ * state was reached, and MO_HALT_MATCHING when a report callback asked to
+ * stop (in that case *state and *scan_end are left untouched).
+ *
+ * NOTE(review): the 32-entry successor table has to fit in a single SVE
+ * register, i.e. the vector length must be at least 32 bytes -- confirm that
+ * callers only select this path on such hardware.
+ */
+static really_inline
+char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
+                  const struct sheng32 *s,
+                  u8 *const cached_accept_state,
+                  ReportID *const cached_accept_id,
+                  u8 single, u64a base_offset, const u8 *buf, const u8 *start,
+                  const u8 *end, const u8 **scan_end) {
+    DEBUG_PRINTF("Starting DFA execution in state %u\n",
+                 *state & SHENG32_STATE_MASK);
+    const u8 *cur_buf = start;
+    if (DEAD_FUNC32(*state)) {
+        DEBUG_PRINTF("Dead on arrival\n");
+        *scan_end = end;
+        return MO_CONTINUE_MATCHING;
+    }
+    DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start));
+
+    const svbool_t lane_pred_32 = svwhilelt_b8(0, 32);
+    /* svtbl yields 0 for out-of-range indices, so the flag bits carried in
+     * the state byte must be stripped before it is used as an index. */
+    const svuint8_t tbl_mask = svdup_u8((u8)SHENG32_STATE_MASK);
+    /* 'state' points at a single byte: broadcast it instead of loading a
+     * whole vector from that address. */
+    svuint8_t cur_state = svdup_u8(*state);
+    const m512 *masks = s->succ_masks;
+
+    while (likely(cur_buf != end)) {
+        const u8 c = *cur_buf;
+        svuint8_t succ_mask = svld1(lane_pred_32, (const u8*)(masks + c));
+        /* svtbl(table, indices): the successor row is the table and the
+         * current state selects the entry. */
+        cur_state = svtbl(succ_mask,
+                          svand_x(svptrue_b8(), cur_state, tbl_mask));
+        const u8 tmp = svlastb(lane_pred_32, cur_state);
+
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG32_STATE_MASK,
+                     tmp & SHENG32_STATE_FLAG_MASK);
+
+        if (unlikely(ACCEPT_FUNC32(tmp))) {
+            DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG32_STATE_MASK);
+            u64a match_offset = base_offset + (cur_buf - buf) + 1;
+            DEBUG_PRINTF("Match @ %llu\n", match_offset);
+            if (STOP_AT_MATCH) {
+                DEBUG_PRINTF("Stopping at match @ %lli\n",
+                             (s64a)(cur_buf - start));
+                *state = tmp;
+                *scan_end = cur_buf;
+                return MO_MATCHES_PENDING;
+            }
+            if (single) {
+                if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                    MO_HALT_MATCHING) {
+                    return MO_HALT_MATCHING;
+                }
+            } else {
+                if (fireReports32(s, cb, ctxt, tmp, match_offset,
+                                  cached_accept_state, cached_accept_id,
+                                  0) == MO_HALT_MATCHING) {
+                    return MO_HALT_MATCHING;
+                }
+            }
+        }
+        cur_buf++;
+    }
+    /* Every lane holds the same state byte, so the last active lane is it. */
+    *state = svlastb(lane_pred_32, cur_state);
+    *scan_end = cur_buf;
+    return MO_CONTINUE_MATCHING;
+}
+
+/*
+ * SVE implementation of the one-byte-per-step Sheng64 scan loop.
+ *
+ * Parameters mirror the Sheng32 variant: 'state' is the in/out DFA state
+ * byte, 's' provides succ_masks (one successor table per input byte value)
+ * and the single report id, [start, end) is the range to scan, match offsets
+ * are base_offset + (pos - buf) + 1, and *scan_end receives the position
+ * where scanning stopped.
+ *
+ * Returns MO_CONTINUE_MATCHING when the range was consumed (or the state was
+ * already dead), MO_MATCHES_PENDING when STOP_AT_MATCH is set and an accept
+ * state was reached, and MO_HALT_MATCHING when a report callback asked to
+ * stop (in that case *state and *scan_end are left untouched).
+ *
+ * NOTE(review): the 64-entry successor table has to fit in a single SVE
+ * register, i.e. the vector length must be at least 64 bytes -- confirm that
+ * callers only select this path on such hardware.
+ */
+static really_inline
+char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt,
+                  const struct sheng64 *s,
+                  u8 *const cached_accept_state,
+                  ReportID *const cached_accept_id,
+                  u8 single, u64a base_offset, const u8 *buf, const u8 *start,
+                  const u8 *end, const u8 **scan_end) {
+    DEBUG_PRINTF("Starting DFA execution in state %u\n",
+                 *state & SHENG64_STATE_MASK);
+    const u8 *cur_buf = start;
+    if (DEAD_FUNC64(*state)) {
+        DEBUG_PRINTF("Dead on arrival\n");
+        *scan_end = end;
+        return MO_CONTINUE_MATCHING;
+    }
+    DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start));
+
+    const svbool_t lane_pred_64 = svwhilelt_b8(0, 64);
+    /* svtbl yields 0 for out-of-range indices, so the flag bits carried in
+     * the state byte must be stripped before it is used as an index. */
+    const svuint8_t tbl_mask = svdup_u8((u8)SHENG64_STATE_MASK);
+    /* 'state' points at a single byte: broadcast it instead of loading a
+     * whole vector from that address. */
+    svuint8_t cur_state = svdup_u8(*state);
+    const m512 *masks = s->succ_masks;
+
+    while (likely(cur_buf != end)) {
+        const u8 c = *cur_buf;
+        svuint8_t succ_mask = svld1(lane_pred_64, (const u8*)(masks + c));
+        /* svtbl(table, indices): the successor row is the table and the
+         * current state selects the entry. */
+        cur_state = svtbl(succ_mask,
+                          svand_x(svptrue_b8(), cur_state, tbl_mask));
+        const u8 tmp = svlastb(lane_pred_64, cur_state);
+
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG64_STATE_MASK,
+                     tmp & SHENG64_STATE_FLAG_MASK);
+
+        if (unlikely(ACCEPT_FUNC64(tmp))) {
+            DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG64_STATE_MASK);
+            u64a match_offset = base_offset + (cur_buf - buf) + 1;
+            DEBUG_PRINTF("Match @ %llu\n", match_offset);
+            if (STOP_AT_MATCH) {
+                DEBUG_PRINTF("Stopping at match @ %lli\n",
+                             (s64a)(cur_buf - start));
+                *state = tmp;
+                *scan_end = cur_buf;
+                return MO_MATCHES_PENDING;
+            }
+            if (single) {
+                if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                    MO_HALT_MATCHING) {
+                    return MO_HALT_MATCHING;
+                }
+            } else {
+                if (fireReports64(s, cb, ctxt, tmp, match_offset,
+                                  cached_accept_state, cached_accept_id,
+                                  0) == MO_HALT_MATCHING) {
+                    return MO_HALT_MATCHING;
+                }
+            }
+        }
+        cur_buf++;
+    }
+    /* Every lane holds the same state byte, so the last active lane is it. */
+    *state = svlastb(lane_pred_64, cur_state);
+    *scan_end = cur_buf;
+    return MO_CONTINUE_MATCHING;
+}
+#endif
+
#if defined(HAVE_AVX512VBMI)
static really_inline
char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
return MO_CONTINUE_MATCHING;
}
+#if defined(HAVE_SVE)
+/*
+ * SVE implementation of the four-bytes-per-iteration Sheng32 scan loop with
+ * optional acceleration.
+ *
+ * state               in/out: current DFA state byte (state id in the bits
+ *                     selected by SHENG32_STATE_MASK, flags in the rest).
+ * cb, ctxt            report callback and its context.
+ * s                   engine structure; succ_masks holds one successor table
+ *                     per input byte value, report is used when 'single'.
+ * cached_accept_*     passed through to fireReports32().
+ * single              non-zero: fire s->report through fireSingleReport().
+ * base_offset, buf    match offsets are (base_offset + 1) + (pos - buf).
+ * start, end          range of bytes to scan; only whole groups of four
+ *                     bytes are consumed, the remainder is left for the
+ *                     caller (see *scan_end).
+ * scan_end            out: where scanning stopped.
+ *
+ * Returns MO_CONTINUE_MATCHING when scanning ran out of whole groups or a
+ * dead state was reached, MO_MATCHES_PENDING when STOP_AT_MATCH is set and
+ * an accept state was reached, and MO_HALT_MATCHING when a report callback
+ * asked to stop (in that case *state and *scan_end are left untouched).
+ *
+ * NOTE(review): the 32-entry successor table has to fit in a single SVE
+ * register, i.e. the vector length must be at least 32 bytes -- confirm that
+ * callers only select this path on such hardware.
+ */
+static really_inline
+char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,
+                  const struct sheng32 *s,
+                  u8 *const cached_accept_state,
+                  ReportID *const cached_accept_id,
+                  u8 single, u64a base_offset, const u8 *buf, const u8 *start,
+                  const u8 *end, const u8 **scan_end) {
+    DEBUG_PRINTF("Starting DFAx4 execution in state %u\n",
+                 *state & SHENG32_STATE_MASK);
+    const u8 *cur_buf = start;
+    const u8 *min_accel_dist = start;
+    base_offset++;
+    DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start));
+
+    if (INNER_ACCEL_FUNC32(*state) || OUTER_ACCEL_FUNC32(*state)) {
+        DEBUG_PRINTF("Accel state reached @ 0\n");
+        const union AccelAux *aaux =
+            get_accel32(s, *state & SHENG32_STATE_MASK);
+        const u8 *new_offset = run_accel(aaux, cur_buf, end);
+        /* A short skip earns a larger penalty before accel is retried. */
+        if (new_offset < cur_buf + BAD_ACCEL_DIST) {
+            min_accel_dist = new_offset + BIG_ACCEL_PENALTY;
+        } else {
+            min_accel_dist = new_offset + SMALL_ACCEL_PENALTY;
+        }
+        DEBUG_PRINTF("Next accel chance: %llu\n",
+                     (u64a)(min_accel_dist - start));
+        DEBUG_PRINTF("Accel scanned %zu bytes\n",
+                     (size_t)(new_offset - cur_buf));
+        cur_buf = new_offset;
+        DEBUG_PRINTF("New offset: %lli\n", (s64a)(cur_buf - start));
+    }
+    if (INNER_DEAD_FUNC32(*state) || OUTER_DEAD_FUNC32(*state)) {
+        DEBUG_PRINTF("Dead on arrival\n");
+        *scan_end = end;
+        return MO_CONTINUE_MATCHING;
+    }
+
+    const svbool_t lane_pred_32 = svwhilelt_b8(0, 32);
+    const svbool_t all_lanes = svptrue_b8();
+    /* svtbl yields 0 for out-of-range indices, so the flag bits carried in
+     * the state byte must be stripped before it is used as an index. */
+    const svuint8_t tbl_mask = svdup_u8((u8)SHENG32_STATE_MASK);
+    /* 'state' points at a single byte: broadcast it instead of loading a
+     * whole vector from that address. */
+    svuint8_t cur_state = svdup_u8(*state);
+    const m512 *masks = s->succ_masks;
+
+    while (likely(end - cur_buf >= 4)) {
+        const u8 *b1 = cur_buf;
+        const u8 *b2 = cur_buf + 1;
+        const u8 *b3 = cur_buf + 2;
+        const u8 *b4 = cur_buf + 3;
+        const u8 c1 = *b1;
+        const u8 c2 = *b2;
+        const u8 c3 = *b3;
+        const u8 c4 = *b4;
+
+        /* svtbl(table, indices): the successor row is the table and the
+         * current state selects the entry. */
+        svuint8_t succ_mask1 = svld1(lane_pred_32, (const u8*)(masks+c1));
+        cur_state = svtbl(succ_mask1,
+                          svand_x(all_lanes, cur_state, tbl_mask));
+        const u8 a1 = svlastb(lane_pred_32, cur_state);
+
+        svuint8_t succ_mask2 = svld1(lane_pred_32, (const u8*)(masks+c2));
+        cur_state = svtbl(succ_mask2,
+                          svand_x(all_lanes, cur_state, tbl_mask));
+        const u8 a2 = svlastb(lane_pred_32, cur_state);
+
+        svuint8_t succ_mask3 = svld1(lane_pred_32, (const u8*)(masks+c3));
+        cur_state = svtbl(succ_mask3,
+                          svand_x(all_lanes, cur_state, tbl_mask));
+        const u8 a3 = svlastb(lane_pred_32, cur_state);
+
+        svuint8_t succ_mask4 = svld1(lane_pred_32, (const u8*)(masks+c4));
+        cur_state = svtbl(succ_mask4,
+                          svand_x(all_lanes, cur_state, tbl_mask));
+        const u8 a4 = svlastb(lane_pred_32, cur_state);
+
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG32_STATE_MASK,
+                     a1 & SHENG32_STATE_FLAG_MASK);
+
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG32_STATE_MASK,
+                     a2 & SHENG32_STATE_FLAG_MASK);
+
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG32_STATE_MASK,
+                     a3 & SHENG32_STATE_FLAG_MASK);
+
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? c4 : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG32_STATE_MASK,
+                     a4 & SHENG32_STATE_FLAG_MASK);
+
+        if (unlikely(INTERESTING_FUNC32(a1, a2, a3, a4))) {
+            if (ACCEPT_FUNC32(a1)) {
+                u64a match_offset = base_offset + b1 - buf;
+                DEBUG_PRINTF("Accept state %u reached\n",
+                             a1 & SHENG32_STATE_MASK);
+                DEBUG_PRINTF("Match @ %llu\n", match_offset);
+                if (STOP_AT_MATCH) {
+                    DEBUG_PRINTF("Stopping at match @ %lli\n",
+                                 (s64a)(b1 - start));
+                    *scan_end = b1;
+                    *state = a1;
+                    return MO_MATCHES_PENDING;
+                }
+                if (single) {
+                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                        MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                } else {
+                    if (fireReports32(s, cb, ctxt, a1, match_offset,
+                                      cached_accept_state, cached_accept_id,
+                                      0) == MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                }
+            }
+            if (ACCEPT_FUNC32(a2)) {
+                u64a match_offset = base_offset + b2 - buf;
+                DEBUG_PRINTF("Accept state %u reached\n",
+                             a2 & SHENG32_STATE_MASK);
+                DEBUG_PRINTF("Match @ %llu\n", match_offset);
+                if (STOP_AT_MATCH) {
+                    DEBUG_PRINTF("Stopping at match @ %lli\n",
+                                 (s64a)(b2 - start));
+                    *scan_end = b2;
+                    *state = a2;
+                    return MO_MATCHES_PENDING;
+                }
+                if (single) {
+                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                        MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                } else {
+                    if (fireReports32(s, cb, ctxt, a2, match_offset,
+                                      cached_accept_state, cached_accept_id,
+                                      0) == MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                }
+            }
+            if (ACCEPT_FUNC32(a3)) {
+                u64a match_offset = base_offset + b3 - buf;
+                DEBUG_PRINTF("Accept state %u reached\n",
+                             a3 & SHENG32_STATE_MASK);
+                DEBUG_PRINTF("Match @ %llu\n", match_offset);
+                if (STOP_AT_MATCH) {
+                    DEBUG_PRINTF("Stopping at match @ %lli\n",
+                                 (s64a)(b3 - start));
+                    *scan_end = b3;
+                    *state = a3;
+                    return MO_MATCHES_PENDING;
+                }
+                if (single) {
+                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                        MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                } else {
+                    if (fireReports32(s, cb, ctxt, a3, match_offset,
+                                      cached_accept_state, cached_accept_id,
+                                      0) == MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                }
+            }
+            if (ACCEPT_FUNC32(a4)) {
+                u64a match_offset = base_offset + b4 - buf;
+                DEBUG_PRINTF("Accept state %u reached\n",
+                             a4 & SHENG32_STATE_MASK);
+                DEBUG_PRINTF("Match @ %llu\n", match_offset);
+                if (STOP_AT_MATCH) {
+                    DEBUG_PRINTF("Stopping at match @ %lli\n",
+                                 (s64a)(b4 - start));
+                    *scan_end = b4;
+                    *state = a4;
+                    return MO_MATCHES_PENDING;
+                }
+                if (single) {
+                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                        MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                } else {
+                    if (fireReports32(s, cb, ctxt, a4, match_offset,
+                                      cached_accept_state, cached_accept_id,
+                                      0) == MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                }
+            }
+            if (INNER_DEAD_FUNC32(a4)) {
+                DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf));
+                *scan_end = end;
+                *state = a4;
+                return MO_CONTINUE_MATCHING;
+            }
+            if (cur_buf > min_accel_dist && INNER_ACCEL_FUNC32(a4)) {
+                DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf));
+                const union AccelAux *aaux =
+                    get_accel32(s, a4 & SHENG32_STATE_MASK);
+                const u8 *new_offset = run_accel(aaux, cur_buf + 4, end);
+                if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) {
+                    min_accel_dist = new_offset + BIG_ACCEL_PENALTY;
+                } else {
+                    min_accel_dist = new_offset + SMALL_ACCEL_PENALTY;
+                }
+                DEBUG_PRINTF("Next accel chance: %llu\n",
+                             (u64a)(min_accel_dist - start));
+                DEBUG_PRINTF("Accel scanned %llu bytes\n",
+                             (u64a)(new_offset - cur_buf - 4));
+                cur_buf = new_offset;
+                DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf));
+                continue;
+            }
+        }
+        if (OUTER_DEAD_FUNC32(a4)) {
+            DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf));
+            *scan_end = end;
+            *state = a4;
+            return MO_CONTINUE_MATCHING;
+        }
+        if (cur_buf > min_accel_dist && OUTER_ACCEL_FUNC32(a4)) {
+            DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf));
+            const union AccelAux *aaux =
+                get_accel32(s, a4 & SHENG32_STATE_MASK);
+            const u8 *new_offset = run_accel(aaux, cur_buf + 4, end);
+            if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) {
+                min_accel_dist = new_offset + BIG_ACCEL_PENALTY;
+            } else {
+                min_accel_dist = new_offset + SMALL_ACCEL_PENALTY;
+            }
+            DEBUG_PRINTF("Next accel chance: %llu\n",
+                         (u64a)(min_accel_dist - start));
+            DEBUG_PRINTF("Accel scanned %llu bytes\n",
+                         (u64a)(new_offset - cur_buf - 4));
+            cur_buf = new_offset;
+            DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf));
+            continue;
+        }
+        cur_buf += 4;
+    }
+    /* Every lane holds the same state byte, so the last active lane is it. */
+    *state = svlastb(lane_pred_32, cur_state);
+    *scan_end = cur_buf;
+    return MO_CONTINUE_MATCHING;
+}
+
+#if !defined(NO_SHENG64_IMPL)
+/*
+ * SVE implementation of the four-bytes-per-iteration Sheng64 scan loop
+ * (no acceleration support in this variant).
+ *
+ * Parameters mirror the Sheng32 x4 variant: 'state' is the in/out DFA state
+ * byte, 's' provides succ_masks (one successor table per input byte value)
+ * and the single report id, [start, end) is the range to scan in whole
+ * groups of four bytes, match offsets are (base_offset + 1) + (pos - buf),
+ * and *scan_end receives the position where scanning stopped.
+ *
+ * Returns MO_CONTINUE_MATCHING when scanning ran out of whole groups or a
+ * dead state was reached, MO_MATCHES_PENDING when STOP_AT_MATCH is set and
+ * an accept state was reached, and MO_HALT_MATCHING when a report callback
+ * asked to stop (in that case *state and *scan_end are left untouched).
+ *
+ * NOTE(review): the 64-entry successor table has to fit in a single SVE
+ * register, i.e. the vector length must be at least 64 bytes -- confirm that
+ * callers only select this path on such hardware.
+ */
+static really_inline
+char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt,
+                  const struct sheng64 *s,
+                  u8 *const cached_accept_state,
+                  ReportID *const cached_accept_id,
+                  u8 single, u64a base_offset, const u8 *buf, const u8 *start,
+                  const u8 *end, const u8 **scan_end) {
+    DEBUG_PRINTF("Starting DFAx4 execution in state %u\n",
+                 *state & SHENG64_STATE_MASK);
+    const u8 *cur_buf = start;
+    base_offset++;
+    DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start));
+
+    if (INNER_DEAD_FUNC64(*state) || OUTER_DEAD_FUNC64(*state)) {
+        DEBUG_PRINTF("Dead on arrival\n");
+        *scan_end = end;
+        return MO_CONTINUE_MATCHING;
+    }
+
+    const svbool_t lane_pred_64 = svwhilelt_b8(0, 64);
+    const svbool_t all_lanes = svptrue_b8();
+    /* svtbl yields 0 for out-of-range indices, so the flag bits carried in
+     * the state byte must be stripped before it is used as an index. */
+    const svuint8_t tbl_mask = svdup_u8((u8)SHENG64_STATE_MASK);
+    /* 'state' points at a single byte: broadcast it instead of loading a
+     * whole vector from that address. */
+    svuint8_t cur_state = svdup_u8(*state);
+    const m512 *masks = s->succ_masks;
+
+    while (likely(end - cur_buf >= 4)) {
+        const u8 *b1 = cur_buf;
+        const u8 *b2 = cur_buf + 1;
+        const u8 *b3 = cur_buf + 2;
+        const u8 *b4 = cur_buf + 3;
+        const u8 c1 = *b1;
+        const u8 c2 = *b2;
+        const u8 c3 = *b3;
+        const u8 c4 = *b4;
+
+        /* svtbl(table, indices): the successor row is the table and the
+         * current state selects the entry. */
+        svuint8_t succ_mask1 = svld1(lane_pred_64, (const u8*)(masks+c1));
+        cur_state = svtbl(succ_mask1,
+                          svand_x(all_lanes, cur_state, tbl_mask));
+        const u8 a1 = svlastb(lane_pred_64, cur_state);
+
+        svuint8_t succ_mask2 = svld1(lane_pred_64, (const u8*)(masks+c2));
+        cur_state = svtbl(succ_mask2,
+                          svand_x(all_lanes, cur_state, tbl_mask));
+        const u8 a2 = svlastb(lane_pred_64, cur_state);
+
+        svuint8_t succ_mask3 = svld1(lane_pred_64, (const u8*)(masks+c3));
+        cur_state = svtbl(succ_mask3,
+                          svand_x(all_lanes, cur_state, tbl_mask));
+        const u8 a3 = svlastb(lane_pred_64, cur_state);
+
+        svuint8_t succ_mask4 = svld1(lane_pred_64, (const u8*)(masks+c4));
+        cur_state = svtbl(succ_mask4,
+                          svand_x(all_lanes, cur_state, tbl_mask));
+        const u8 a4 = svlastb(lane_pred_64, cur_state);
+
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG64_STATE_MASK,
+                     a1 & SHENG64_STATE_FLAG_MASK);
+
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG64_STATE_MASK,
+                     a2 & SHENG64_STATE_FLAG_MASK);
+
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG64_STATE_MASK,
+                     a3 & SHENG64_STATE_FLAG_MASK);
+
+        DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? c4 : '?');
+        DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG64_STATE_MASK,
+                     a4 & SHENG64_STATE_FLAG_MASK);
+
+        if (unlikely(INTERESTING_FUNC64(a1, a2, a3, a4))) {
+            if (ACCEPT_FUNC64(a1)) {
+                u64a match_offset = base_offset + b1 - buf;
+                DEBUG_PRINTF("Accept state %u reached\n",
+                             a1 & SHENG64_STATE_MASK);
+                DEBUG_PRINTF("Match @ %llu\n", match_offset);
+                if (STOP_AT_MATCH) {
+                    DEBUG_PRINTF("Stopping at match @ %lli\n",
+                                 (s64a)(b1 - start));
+                    *scan_end = b1;
+                    *state = a1;
+                    return MO_MATCHES_PENDING;
+                }
+                if (single) {
+                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                        MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                } else {
+                    if (fireReports64(s, cb, ctxt, a1, match_offset,
+                                      cached_accept_state, cached_accept_id,
+                                      0) == MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                }
+            }
+            if (ACCEPT_FUNC64(a2)) {
+                u64a match_offset = base_offset + b2 - buf;
+                DEBUG_PRINTF("Accept state %u reached\n",
+                             a2 & SHENG64_STATE_MASK);
+                DEBUG_PRINTF("Match @ %llu\n", match_offset);
+                if (STOP_AT_MATCH) {
+                    DEBUG_PRINTF("Stopping at match @ %lli\n",
+                                 (s64a)(b2 - start));
+                    *scan_end = b2;
+                    *state = a2;
+                    return MO_MATCHES_PENDING;
+                }
+                if (single) {
+                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                        MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                } else {
+                    if (fireReports64(s, cb, ctxt, a2, match_offset,
+                                      cached_accept_state, cached_accept_id,
+                                      0) == MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                }
+            }
+            if (ACCEPT_FUNC64(a3)) {
+                u64a match_offset = base_offset + b3 - buf;
+                DEBUG_PRINTF("Accept state %u reached\n",
+                             a3 & SHENG64_STATE_MASK);
+                DEBUG_PRINTF("Match @ %llu\n", match_offset);
+                if (STOP_AT_MATCH) {
+                    DEBUG_PRINTF("Stopping at match @ %lli\n",
+                                 (s64a)(b3 - start));
+                    *scan_end = b3;
+                    *state = a3;
+                    return MO_MATCHES_PENDING;
+                }
+                if (single) {
+                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                        MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                } else {
+                    if (fireReports64(s, cb, ctxt, a3, match_offset,
+                                      cached_accept_state, cached_accept_id,
+                                      0) == MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                }
+            }
+            if (ACCEPT_FUNC64(a4)) {
+                u64a match_offset = base_offset + b4 - buf;
+                DEBUG_PRINTF("Accept state %u reached\n",
+                             a4 & SHENG64_STATE_MASK);
+                DEBUG_PRINTF("Match @ %llu\n", match_offset);
+                if (STOP_AT_MATCH) {
+                    DEBUG_PRINTF("Stopping at match @ %lli\n",
+                                 (s64a)(b4 - start));
+                    *scan_end = b4;
+                    *state = a4;
+                    return MO_MATCHES_PENDING;
+                }
+                if (single) {
+                    if (fireSingleReport(cb, ctxt, s->report, match_offset) ==
+                        MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                } else {
+                    if (fireReports64(s, cb, ctxt, a4, match_offset,
+                                      cached_accept_state, cached_accept_id,
+                                      0) == MO_HALT_MATCHING) {
+                        return MO_HALT_MATCHING;
+                    }
+                }
+            }
+            if (INNER_DEAD_FUNC64(a4)) {
+                DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf));
+                *scan_end = end;
+                *state = a4;
+                return MO_CONTINUE_MATCHING;
+            }
+        }
+        if (OUTER_DEAD_FUNC64(a4)) {
+            DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf));
+            *scan_end = end;
+            *state = a4;
+            return MO_CONTINUE_MATCHING;
+        }
+        cur_buf += 4;
+    }
+    /* Every lane holds the same state byte, so the last active lane is it. */
+    *state = svlastb(lane_pred_64, cur_state);
+    *scan_end = cur_buf;
+    return MO_CONTINUE_MATCHING;
+}
+#endif
+#endif
+
#if defined(HAVE_AVX512VBMI)
static really_inline
char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt,