/*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
using namespace std;
using namespace ue2;
+using MatchSet = set<pair<size_t, size_t>>;
+using StateBitSet = boost::dynamic_bitset<>;
+
namespace {
+// returns all successors up to a given depth in a vector of sets, indexed by
+// zero-based depth from source vertex
+static
+vector<flat_set<NFAVertex>>
+gatherSuccessorsByDepth(const NGHolder &g, const NFAVertex &src, u32 depth) {
+ vector<flat_set<NFAVertex>> result(depth);
+ flat_set<NFAVertex> cur, next;
+
+ // populate current set of successors
+ for (auto v : adjacent_vertices_range(src, g)) {
+ // ignore self-loops
+ if (src == v) {
+ continue;
+ }
+ DEBUG_PRINTF("Node %zu depth 1\n", g[v].index);
+
+ cur.insert(v);
+ }
+ result[0] = cur;
+
+ for (u32 d = 1; d < depth; d++) {
+ // collect all successors for all current level vertices
+ for (auto v : cur) {
+ // don't go past special nodes
+ if (is_special(v, g)) {
+ continue;
+ }
+
+ for (auto succ : adjacent_vertices_range(v, g)) {
+ // ignore self-loops
+ if (v == succ) {
+ continue;
+ }
+ DEBUG_PRINTF("Node %zu depth %u\n", g[succ].index, d + 1);
+ next.insert(succ);
+ }
+ }
+ result[d] = next;
+ next.swap(cur);
+ next.clear();
+ }
+
+ return result;
+}
+
+// returns all predecessors up to a given depth in a vector of sets, indexed by
+// zero-based depth from source vertex
+static
+vector<flat_set<NFAVertex>> gatherPredecessorsByDepth(const NGHolder &g,
+ NFAVertex src, u32 depth) {
+ vector<flat_set<NFAVertex>> result(depth);
+ flat_set<NFAVertex> cur, next;
+
+ assert(depth > 0);
+
+ // populate current set of successors
+ for (auto v : inv_adjacent_vertices_range(src, g)) {
+ // ignore self-loops
+ if (src == v) {
+ continue;
+ }
+ DEBUG_PRINTF("Node %zu depth 1\n", g[v].index);
+ cur.insert(v);
+ }
+ result[0] = cur;
+
+ for (u32 d = 1; d < depth; d++) {
+ // collect all successors for all current level vertices
+ for (auto v : cur) {
+ for (auto pred : inv_adjacent_vertices_range(v, g)) {
+ // ignore self-loops
+ if (v == pred) {
+ continue;
+ }
+ DEBUG_PRINTF("Node %zu depth %u\n", g[pred].index, d + 1);
+ next.insert(pred);
+ }
+ }
+ result[d] = next;
+ next.swap(cur);
+ next.clear();
+ }
+
+ return result;
+}
+
+// this is a per-vertex, per-shadow level state transition table
+struct GraphCache {
+ GraphCache(u32 dist_in, const NGHolder &g) :
+ size(num_vertices(g)), edit_distance(dist_in)
+ {
+ auto dist_max = edit_distance + 1;
+
+ allocateStateTransitionTable(dist_max);
+ populateTransitionCache(g, dist_max);
+ populateAcceptCache(g, dist_max);
+ }
+
+ void allocateStateTransitionTable(u32 dist_max) {
+ // resize level 1 - per vertex
+ shadow_transitions.resize(size);
+ helper_transitions.resize(size);
+
+ // resize level 2 - per shadow level
+ for (u32 i = 0; i < size; i++) {
+ shadow_transitions[i].resize(dist_max);
+ helper_transitions[i].resize(dist_max);
+
+ // resize level 3 - per vertex
+ for (u32 d = 0; d < dist_max; d++) {
+ shadow_transitions[i][d].resize(size);
+ helper_transitions[i][d].resize(size);
+ }
+ }
+
+ // accept states are indexed by edit distance
+ accept_states.resize(dist_max);
+ accept_eod_states.resize(dist_max);
+
+ // vertex report maps are indexed by edit distance
+ vertex_reports_by_level.resize(dist_max);
+ vertex_eod_reports_by_level.resize(dist_max);
+ }
+
+ /*
+ * certain transitions to helpers are disallowed:
+ * 1. transitions from accept/acceptEod
+ * 2. transitions to accept/acceptEod
+ * 3. from start to startDs
+ * 4. to a virtual/multiline start
+ *
+ * everything else is allowed.
+ */
+ bool canTransitionToHelper(NFAVertex u, NFAVertex v, const NGHolder &g) const {
+ if (is_any_accept(u, g)) {
+ return false;
+ }
+ if (is_any_accept(v, g)) {
+ return false;
+ }
+ if (u == g.start && v == g.startDs) {
+ return false;
+ }
+ if (is_virtual_start(v, g)) {
+ return false;
+ }
+ return true;
+ }
+
+ void populateTransitionCache(const NGHolder &g, u32 dist_max) {
+ // populate mapping of vertex index to vertex
+ vector<NFAVertex> idx_to_v(size);
+ for (auto v : vertices_range(g)) {
+ idx_to_v[g[v].index] = v;
+ }
+
+ for (u32 i = 0; i < size; i++) {
+ auto cur_v = idx_to_v[i];
+
+ // set up transition tables
+ auto succs = gatherSuccessorsByDepth(g, cur_v, dist_max);
+
+ assert(succs.size() == dist_max);
+
+ for (u32 d = 0; d < dist_max; d++) {
+ auto &v_shadows = shadow_transitions[i][d];
+ auto cur_v_bit = i;
+
+ // enable transition to next level helper (this handles insertion)
+ if (d < edit_distance && !is_any_accept(cur_v, g)) {
+ auto &next_v_helpers = helper_transitions[i][d + 1];
+
+ next_v_helpers.set(cur_v_bit);
+ }
+
+ // if vertex has a self-loop, we can also transition to it,
+ // but only if we're at shadow level 0
+ if (edge(cur_v, cur_v, g).second && d == 0) {
+ v_shadows.set(cur_v_bit);
+ }
+
+ // populate state transition tables
+ for (auto v : succs[d]) {
+ auto v_bit = g[v].index;
+
+ // we cannot transition to startDs on any level other than
+ // level 0
+ if (v != g.startDs || d == 0) {
+ // this handles direct transitions as well as removals
+ v_shadows.set(v_bit);
+ }
+
+ // we can also transition to next-level helper (handles
+ // replace), provided we meet the criteria
+ if (d < edit_distance && canTransitionToHelper(cur_v, v, g)) {
+ auto &next_v_helpers = helper_transitions[i][d + 1];
+
+ next_v_helpers.set(v_bit);
+ }
+ }
+ }
+ }
+ }
+
+ void populateAcceptCache(const NGHolder &g, u32 dist_max) {
+ // set up accept states masks
+ StateBitSet accept(size);
+ accept.set(g[g.accept].index);
+ StateBitSet accept_eod(size);
+ accept_eod.set(g[g.acceptEod].index);
+
+ // gather accept and acceptEod states
+ for (u32 base_dist = 0; base_dist < dist_max; base_dist++) {
+ auto &states = accept_states[base_dist];
+ auto &eod_states = accept_eod_states[base_dist];
+
+ states.resize(size);
+ eod_states.resize(size);
+
+ // inspect each vertex
+ for (u32 i = 0; i < size; i++) {
+ // inspect all shadow levels from base_dist to dist_max
+ for (u32 d = 0; d < dist_max - base_dist; d++) {
+ auto &shadows = shadow_transitions[i][d];
+
+ // if this state transitions to accept, set its bit
+ if ((shadows & accept).any()) {
+ states.set(i);
+ }
+ if ((shadows & accept_eod).any()) {
+ eod_states.set(i);
+ }
+ }
+ }
+ }
+
+ // populate accepts cache
+ for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
+ const auto &rs = g[v].reports;
+
+ for (u32 d = 0; d <= edit_distance; d++) {
+ // add self to report list at all levels
+ vertex_reports_by_level[d][v].insert(rs.begin(), rs.end());
+ }
+ if (edit_distance == 0) {
+ // if edit distance is 0, no predecessors will have reports
+ continue;
+ }
+
+ auto preds_by_depth = gatherPredecessorsByDepth(g, v, edit_distance);
+ for (u32 pd = 0; pd < preds_by_depth.size(); pd++) {
+ const auto &preds = preds_by_depth[pd];
+ // for each predecessor, add reports up to maximum edit distance
+ // for current depth from source vertex
+ for (auto pred : preds) {
+ for (u32 d = 0; d < edit_distance - pd; d++) {
+ vertex_reports_by_level[d][pred].insert(rs.begin(), rs.end());
+ }
+ }
+ }
+ }
+ for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
+ const auto &rs = g[v].reports;
+
+ if (v == g.accept) {
+ continue;
+ }
+
+ for (u32 d = 0; d <= edit_distance; d++) {
+ // add self to report list at all levels
+ vertex_eod_reports_by_level[d][v].insert(rs.begin(), rs.end());
+ }
+ if (edit_distance == 0) {
+ // if edit distance is 0, no predecessors will have reports
+ continue;
+ }
+
+ auto preds_by_depth = gatherPredecessorsByDepth(g, v, edit_distance);
+ for (u32 pd = 0; pd < preds_by_depth.size(); pd++) {
+ const auto &preds = preds_by_depth[pd];
+ // for each predecessor, add reports up to maximum edit distance
+ // for current depth from source vertex
+ for (auto pred : preds) {
+ for (u32 d = 0; d < edit_distance - pd; d++) {
+ vertex_eod_reports_by_level[d][pred].insert(rs.begin(), rs.end());
+ }
+ }
+ }
+ }
+ }
+
+#ifdef DEBUG
+ void dumpStateTransitionTable(const NGHolder &g) {
+ StateBitSet accept(size);
+ accept.set(g[g.accept].index);
+ StateBitSet accept_eod(size);
+ accept_eod.set(g[g.acceptEod].index);
+
+ DEBUG_PRINTF("Dumping state transition tables\n");
+ DEBUG_PRINTF("Shadows:\n");
+ for (u32 i = 0; i < num_vertices(g); i++) {
+ DEBUG_PRINTF("%-7s %3u:", "Vertex", i);
+ for (u32 j = 0; j < num_vertices(g); j++) {
+ printf("%3i", j);
+ }
+ printf("\n");
+ for (u32 d = 0; d <= edit_distance; d++) {
+ DEBUG_PRINTF("%-7s %3u:", "Level", d);
+ const auto &s = getShadowTransitions(i, d);
+ for (u32 j = 0; j < num_vertices(g); j++) {
+ printf("%3i", s.test(j));
+ }
+ printf("\n");
+ }
+ DEBUG_PRINTF("\n");
+ }
+
+ DEBUG_PRINTF("Helpers:\n");
+ for (u32 i = 0; i < num_vertices(g); i++) {
+ DEBUG_PRINTF("%-7s %3u:", "Vertex", i);
+ for (u32 j = 0; j < num_vertices(g); j++) {
+ printf("%3i", j);
+ }
+ printf("\n");
+ for (u32 d = 0; d <= edit_distance; d++) {
+ DEBUG_PRINTF("%-7s %3u:", "Level", d);
+ const auto &s = getHelperTransitions(i, d);
+ for (u32 j = 0; j < num_vertices(g); j++) {
+ printf("%3i", s.test(j));
+ }
+ printf("\n");
+ }
+ DEBUG_PRINTF("\n");
+ }
+
+ DEBUG_PRINTF("Accept transitions:\n");
+ DEBUG_PRINTF("%-12s", "Vertex idx:");
+ for (u32 j = 0; j < num_vertices(g); j++) {
+ printf("%3i", j);
+ }
+ printf("\n");
+ for (u32 d = 0; d <= edit_distance; d++) {
+ DEBUG_PRINTF("%-7s %3u:", "Level", d);
+ const auto &s = getAcceptTransitions(d);
+ for (u32 j = 0; j < num_vertices(g); j++) {
+ printf("%3i", s.test(j));
+ }
+ printf("\n");
+ }
+ DEBUG_PRINTF("\n");
+
+ DEBUG_PRINTF("Accept EOD transitions:\n");
+ DEBUG_PRINTF("%-12s", "Vertex idx:");
+ for (u32 j = 0; j < num_vertices(g); j++) {
+ printf("%3i", j);
+ }
+ printf("\n");
+ for (u32 d = 0; d <= edit_distance; d++) {
+ DEBUG_PRINTF("%-7s %3u:", "Level", d);
+ const auto &s = getAcceptEodTransitions(d);
+ for (u32 j = 0; j < num_vertices(g); j++) {
+ printf("%3i", s.test(j));
+ }
+ printf("\n");
+ }
+ DEBUG_PRINTF("\n");
+
+ DEBUG_PRINTF("%-12s ", "Accepts:");
+ for (u32 i = 0; i < num_vertices(g); i++) {
+ printf("%3i", accept.test(i));
+ }
+ printf("\n");
+
+ DEBUG_PRINTF("%-12s ", "EOD Accepts:");
+ for (u32 i = 0; i < num_vertices(g); i++) {
+ printf("%3i", accept_eod.test(i));
+ }
+ printf("\n");
+
+ DEBUG_PRINTF("Reports\n");
+ for (auto v : vertices_range(g)) {
+ for (u32 d = 0; d <= edit_distance; d++) {
+ const auto &r = vertex_reports_by_level[d][v];
+ const auto &e = vertex_eod_reports_by_level[d][v];
+ DEBUG_PRINTF("%-7s %3zu %-8s %3zu %-8s %3zu\n",
+ "Vertex", g[v].index, "rs:", r.size(), "eod:", e.size());
+ }
+ }
+ printf("\n");
+ }
+#endif
+
+ const StateBitSet& getShadowTransitions(u32 idx, u32 level) const {
+ assert(idx < size);
+ assert(level <= edit_distance);
+ return shadow_transitions[idx][level];
+ }
+ const StateBitSet& getHelperTransitions(u32 idx, u32 level) const {
+ assert(idx < size);
+ assert(level <= edit_distance);
+ return helper_transitions[idx][level];
+ }
+ const StateBitSet& getAcceptTransitions(u32 level) const {
+ assert(level <= edit_distance);
+ return accept_states[level];
+ }
+ const StateBitSet& getAcceptEodTransitions(u32 level) const {
+ assert(level <= edit_distance);
+ return accept_eod_states[level];
+ }
+
+ /*
+ * the bitsets are indexed by vertex and shadow level. the bitset's length is
+ * equal to the total number of vertices in the graph.
+ *
+ * for convenience, helper functions are provided.
+ */
+ vector<vector<StateBitSet>> shadow_transitions;
+ vector<vector<StateBitSet>> helper_transitions;
+
+ // accept states masks, indexed by shadow level
+ vector<StateBitSet> accept_states;
+ vector<StateBitSet> accept_eod_states;
+
+ // map of all reports associated with any vertex, indexed by shadow level
+ vector<map<NFAVertex, flat_set<ReportID>>> vertex_reports_by_level;
+ vector<map<NFAVertex, flat_set<ReportID>>> vertex_eod_reports_by_level;
+
+ u32 size;
+ u32 edit_distance;
+};
+
+
+/*
+ * SOM workflow is expected to be the following:
+ * - Caller calls getActiveStates, which reports SOM for each active states
+ * - Caller calls getSuccessorStates on each of the active states, which *doesn't*
+ * report SOM
+ * - Caller decides if the successor state should be activated, and calls
+ * activateState with SOM set to that of previous active state (not successor!)
+ * - activateState then resolves any conflicts between SOMs that may arise from
+ * multiple active states progressing to the same successor
+ */
struct StateSet {
- explicit StateSet(size_t sz) : s(sz), som(sz, 0) {}
- boost::dynamic_bitset<> s; // bitset of states that are on
- vector<size_t> som; // som value for each state
+ struct State {
+ enum node_type {
+ NODE_SHADOW = 0,
+ NODE_HELPER
+ };
+ State(size_t idx_in, u32 level_in, size_t som_in, node_type type_in) :
+ idx(idx_in), level(level_in), som(som_in), type(type_in) {}
+ size_t idx;
+ u32 level;
+ size_t som;
+ node_type type;
+ };
+
+ StateSet(size_t sz, u32 dist_in) :
+ shadows(dist_in + 1), helpers(dist_in + 1),
+ shadows_som(dist_in + 1), helpers_som(dist_in + 1),
+ edit_distance(dist_in) {
+ for (u32 dist = 0; dist <= dist_in; dist++) {
+ shadows[dist].resize(sz, false);
+ helpers[dist].resize(sz, false);
+ shadows_som[dist].resize(sz, 0);
+ helpers_som[dist].resize(sz, 0);
+ }
+ }
+
+ void reset() {
+ for (u32 dist = 0; dist <= edit_distance; dist++) {
+ shadows[dist].reset();
+ helpers[dist].reset();
+ fill(shadows_som[dist].begin(), shadows_som[dist].end(), 0);
+ fill(helpers_som[dist].begin(), helpers_som[dist].end(), 0);
+ }
+ }
+
+ bool empty() const {
+ for (u32 dist = 0; dist <= edit_distance; dist++) {
+ if (shadows[dist].any()) {
+ return false;
+ }
+ if (helpers[dist].any()) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ size_t count() const {
+ size_t result = 0;
+
+ for (u32 dist = 0; dist <= edit_distance; dist++) {
+ result += shadows[dist].count();
+ result += helpers[dist].count();
+ }
+
+ return result;
+ }
+
+ bool setActive(const State &s) {
+ switch (s.type) {
+ case State::NODE_HELPER:
+ return helpers[s.level].test_set(s.idx);
+ case State::NODE_SHADOW:
+ return shadows[s.level].test_set(s.idx);
+ }
+ assert(0);
+ return false;
+ }
+
+ size_t getCachedSom(const State &s) const {
+ switch (s.type) {
+ case State::NODE_HELPER:
+ return helpers_som[s.level][s.idx];
+ case State::NODE_SHADOW:
+ return shadows_som[s.level][s.idx];
+ }
+ assert(0);
+ return 0;
+ }
+
+ void setCachedSom(const State &s, const size_t som_val) {
+ switch (s.type) {
+ case State::NODE_HELPER:
+ helpers_som[s.level][s.idx] = som_val;
+ break;
+ case State::NODE_SHADOW:
+ shadows_som[s.level][s.idx] = som_val;
+ break;
+ default:
+ assert(0);
+ }
+ }
+
+#ifdef DEBUG
+ void dumpActiveStates() const {
+ const auto states = getActiveStates();
+
+ DEBUG_PRINTF("Dumping active states\n");
+
+ for (const auto &state : states) {
+ DEBUG_PRINTF("type: %s idx: %zu level: %u som: %zu\n",
+ state.type == State::NODE_HELPER ? "HELPER" : "SHADOW",
+ state.idx, state.level, state.som);
+ }
+ }
+#endif
+
+ flat_set<State> getActiveStates() const {
+ flat_set<State> result;
+
+ for (u32 dist = 0; dist <= edit_distance; dist++) {
+ // get all shadow vertices (including original graph)
+ const auto &cur_shadow_vertices = shadows[dist];
+ for (size_t id = cur_shadow_vertices.find_first();
+ id != cur_shadow_vertices.npos;
+ id = cur_shadow_vertices.find_next(id)) {
+ result.emplace(id, dist, shadows_som[dist][id],
+ State::NODE_SHADOW);
+ }
+
+ // the rest is only valid for edited graphs
+ if (dist == 0) {
+ continue;
+ }
+
+ // get all helper vertices
+ const auto &cur_helper_vertices = helpers[dist];
+ for (size_t id = cur_helper_vertices.find_first();
+ id != cur_helper_vertices.npos;
+ id = cur_helper_vertices.find_next(id)) {
+ result.emplace(id, dist, helpers_som[dist][id],
+ State::NODE_HELPER);
+ }
+ }
+
+ return result;
+ }
+
+ // does not return SOM
+ flat_set<State> getSuccessors(const State &state, const GraphCache &gc) const {
+ flat_set<State> result;
+
+ // maximum shadow depth that we can go from current level
+ u32 max_depth = edit_distance - state.level + 1;
+
+ for (u32 d = 0; d < max_depth; d++) {
+ const auto &shadow_succ = gc.getShadowTransitions(state.idx, d);
+ for (size_t id = shadow_succ.find_first();
+ id != shadow_succ.npos;
+ id = shadow_succ.find_next(id)) {
+ auto new_level = state.level + d;
+ result.emplace(id, new_level, 0, State::NODE_SHADOW);
+ }
+
+ const auto &helper_succ = gc.getHelperTransitions(state.idx, d);
+ for (size_t id = helper_succ.find_first();
+ id != helper_succ.npos;
+ id = helper_succ.find_next(id)) {
+ auto new_level = state.level + d;
+ result.emplace(id, new_level, 0, State::NODE_HELPER);
+ }
+ }
+
+ return result;
+ }
+
+ flat_set<State> getAcceptStates(const GraphCache &gc) const {
+ flat_set<State> result;
+
+ for (u32 dist = 0; dist <= edit_distance; dist++) {
+ // get all shadow vertices (including original graph)
+ auto cur_shadow_vertices = shadows[dist];
+ cur_shadow_vertices &= gc.getAcceptTransitions(dist);
+ for (size_t id = cur_shadow_vertices.find_first();
+ id != cur_shadow_vertices.npos;
+ id = cur_shadow_vertices.find_next(id)) {
+ result.emplace(id, dist, shadows_som[dist][id],
+ State::NODE_SHADOW);
+ }
+ auto cur_helper_vertices = helpers[dist];
+ cur_helper_vertices &= gc.getAcceptTransitions(dist);
+ for (size_t id = cur_helper_vertices.find_first();
+ id != cur_helper_vertices.npos;
+ id = cur_helper_vertices.find_next(id)) {
+ result.emplace(id, dist, helpers_som[dist][id],
+ State::NODE_HELPER);
+ }
+ }
+
+ return result;
+ }
+
+ flat_set<State> getAcceptEodStates(const GraphCache &gc) const {
+ flat_set<State> result;
+
+ for (u32 dist = 0; dist <= edit_distance; dist++) {
+ // get all shadow vertices (including original graph)
+ auto cur_shadow_vertices = shadows[dist];
+ cur_shadow_vertices &= gc.getAcceptEodTransitions(dist);
+ for (size_t id = cur_shadow_vertices.find_first();
+ id != cur_shadow_vertices.npos;
+ id = cur_shadow_vertices.find_next(id)) {
+ result.emplace(id, dist, shadows_som[dist][id],
+ State::NODE_SHADOW);
+ }
+ auto cur_helper_vertices = helpers[dist];
+ cur_helper_vertices &= gc.getAcceptEodTransitions(dist);
+ for (size_t id = cur_helper_vertices.find_first();
+ id != cur_helper_vertices.npos;
+ id = cur_helper_vertices.find_next(id)) {
+ result.emplace(id, dist, helpers_som[dist][id],
+ State::NODE_HELPER);
+ }
+ }
+
+ return result;
+ }
+
+ // the caller must specify SOM at current offset, and must not attempt to
+ // resolve SOM inheritance conflicts
+ void activateState(const State &state) {
+ size_t cur_som = state.som;
+ if (setActive(state)) {
+ size_t cached_som = getCachedSom(state);
+ cur_som = min(cur_som, cached_som);
+ }
+ setCachedSom(state, cur_som);
+ }
+
+ vector<StateBitSet> shadows;
+ vector<StateBitSet> helpers;
+ vector<vector<size_t>> shadows_som;
+ vector<vector<size_t>> helpers_som;
+ u32 edit_distance;
};
-using MatchSet = set<pair<size_t, size_t>>;
+// for flat_set
+bool operator<(const StateSet::State &a, const StateSet::State &b) {
+ ORDER_CHECK(idx);
+ ORDER_CHECK(level);
+ ORDER_CHECK(type);
+ ORDER_CHECK(som);
+ return false;
+}
struct fmstate {
const size_t num_states; // number of vertices in graph
StateSet states; // currently active states
StateSet next; // states on after this iteration
+ GraphCache &gc;
vector<NFAVertex> vertices; // mapping from index to vertex
size_t offset = 0;
unsigned char cur = 0;
unsigned char prev = 0;
- const bool som;
const bool utf8;
const bool allowStartDs;
const ReportManager &rm;
- boost::dynamic_bitset<> accept; // states leading to accept
- boost::dynamic_bitset<> accept_with_eod; // states leading to accept or eod
-
- fmstate(const NGHolder &g, bool som_in, bool utf8_in, bool aSD_in,
- const ReportManager &rm_in)
- : num_states(num_vertices(g)), states(num_states), next(num_states),
- vertices(num_vertices(g), NGHolder::null_vertex()), som(som_in),
- utf8(utf8_in), allowStartDs(aSD_in), rm(rm_in), accept(num_states),
- accept_with_eod(num_states) {
+ fmstate(const NGHolder &g, GraphCache &gc_in, bool utf8_in, bool aSD_in,
+ const u32 edit_distance, const ReportManager &rm_in)
+ : num_states(num_vertices(g)),
+ states(num_states, edit_distance),
+ next(num_states, edit_distance),
+ gc(gc_in), vertices(num_vertices(g), NGHolder::null_vertex()),
+ utf8(utf8_in), allowStartDs(aSD_in), rm(rm_in) {
// init states
- states.s.set(g[g.start].index);
+ states.activateState(
+ StateSet::State {g[g.start].index, 0, 0,
+ StateSet::State::NODE_SHADOW});
if (allowStartDs) {
- states.s.set(g[g.startDs].index);
+ states.activateState(
+ StateSet::State {g[g.startDs].index, 0, 0,
+ StateSet::State::NODE_SHADOW});
}
// fill vertex mapping
- for (const auto &v : vertices_range(g)) {
+ for (auto v : vertices_range(g)) {
vertices[g[v].index] = v;
}
- // init accept states
- for (const auto &u : inv_adjacent_vertices_range(g.accept, g)) {
- accept.set(g[u].index);
- }
- accept_with_eod = accept;
- for (const auto &u : inv_adjacent_vertices_range(g.acceptEod, g)) {
- accept_with_eod.set(g[u].index);
- }
}
};
}
static
-bool canReach(const NGHolder &g, const NFAEdge &e,
- struct fmstate &state) {
+bool canReach(const NGHolder &g, const NFAEdge &e, struct fmstate &state) {
auto flags = g[e].assert_flags;
if (!flags) {
return true;
static
void getMatches(const NGHolder &g, MatchSet &matches, struct fmstate &state,
bool allowEodMatches) {
- auto acc_states = state.states.s;
- acc_states &= allowEodMatches ? state.accept_with_eod : state.accept;
-
- for (size_t i = acc_states.find_first(); i != acc_states.npos;
- i = acc_states.find_next(i)) {
- const NFAVertex u = state.vertices[i];
- const size_t &som_offset = state.states.som[i];
+ flat_set<NFAVertex> accepts {g.accept, g.acceptEod};
- // we can't accept anything from startDs in between UTF-8 codepoints
- if (state.utf8 && u == g.startDs && !isUtf8CodePoint(state.cur)) {
+ for (auto v : accepts) {
+ bool eod = v == g.acceptEod;
+ if (eod && !allowEodMatches) {
continue;
}
- for (const auto &e : out_edges_range(u, g)) {
- NFAVertex v = target(e, g);
- if (v == g.accept || (v == g.acceptEod && allowEodMatches)) {
- // check edge assertions if we are allowed to reach accept
- if (!canReach(g, e, state)) {
- continue;
- }
- DEBUG_PRINTF("match found at %zu\n", state.offset);
+ auto active_states = eod ? state.states.getAcceptEodStates(state.gc) :
+ state.states.getAcceptStates(state.gc);
- assert(!g[u].reports.empty());
- for (const auto &report_id : g[u].reports) {
- const Report &ri = state.rm.getReport(report_id);
+ DEBUG_PRINTF("Number of active states: %zu\n", active_states.size());
- DEBUG_PRINTF("report %u has offset adjustment %d\n",
- report_id, ri.offsetAdjust);
- matches.emplace(som_offset, state.offset + ri.offsetAdjust);
- }
+ for (const auto &cur : active_states) {
+ auto u = state.vertices[cur.idx];
+
+ // we can't accept anything from startDs in between UTF-8 codepoints
+ if (state.utf8 && u == g.startDs && !isUtf8CodePoint(state.cur)) {
+ continue;
+ }
+
+ const auto &reports =
+ eod ?
+ state.gc.vertex_eod_reports_by_level[cur.level][u] :
+ state.gc.vertex_reports_by_level[cur.level][u];
+
+ NFAEdge e = edge(u, v, g);
+
+ // we assume edge assertions only exist at level 0
+ if (e && !canReach(g, e, state)) {
+ continue;
+ }
+
+ DEBUG_PRINTF("%smatch found at %zu\n",
+ eod ? "eod " : "", state.offset);
+
+ assert(!reports.empty());
+ for (const auto &report_id : reports) {
+ const Report &ri = state.rm.getReport(report_id);
+
+ DEBUG_PRINTF("report %u has offset adjustment %d\n",
+ report_id, ri.offsetAdjust);
+ DEBUG_PRINTF("match from (i:%zu,l:%u,t:%u): (%zu,%zu)\n",
+ cur.idx, cur.level, cur.type, cur.som,
+ state.offset + ri.offsetAdjust);
+ matches.emplace(cur.som, state.offset + ri.offsetAdjust);
}
}
}
static
void step(const NGHolder &g, struct fmstate &state) {
- state.next.s.reset();
+ state.next.reset();
+
+ const auto active = state.states.getActiveStates();
+
+ for (const auto &cur : active) {
+ auto u = state.vertices[cur.idx];
+ auto succ_list = state.states.getSuccessors(cur, state.gc);
- for (size_t i = state.states.s.find_first(); i != state.states.s.npos;
- i = state.states.s.find_next(i)) {
- const NFAVertex &u = state.vertices[i];
- const size_t &u_som_offset = state.states.som[i];
+ for (auto succ : succ_list) {
+ auto v = state.vertices[succ.idx];
- for (const auto &e : out_edges_range(u, g)) {
- NFAVertex v = target(e, g);
- if (v == g.acceptEod) {
- // can't know the future: we don't know if we're at EOD.
+ if (is_any_accept(v, g)) {
continue;
}
- if (v == g.accept) {
+
+ if (!state.allowStartDs && v == g.startDs) {
continue;
}
- if (!state.allowStartDs && v == g.startDs) {
+ // GraphCache doesn't differentiate between successors for shadows
+ // and helpers, and StateSet does not know anything about the graph,
+ // so the only place we can do it is here. we can't self-loop on a
+ // startDs if we're startDs's helper, so disallow it.
+ if (u == g.startDs && v == g.startDs &&
+ succ.level != 0 && succ.level == cur.level) {
continue;
}
- const CharReach &cr = g[v].char_reach;
- const size_t v_idx = g[v].index;
+ // for the reasons outlined above, also putting this here.
+ // disallow transitions from start to startDs on levels other than zero
+ if (u == g.start && v == g.startDs &&
+ cur.level != 0 && succ.level != 0) {
+ continue;
+ }
- // check reachability and edge assertions
- if (cr.test(state.cur) && canReach(g, e, state)) {
- // if we aren't in SOM mode, just set every SOM to 0
- if (!state.som) {
- state.next.s.set(v_idx);
- state.next.som[v_idx] = 0;
- continue;
+ bool can_reach = false;
+
+ if (succ.type == StateSet::State::NODE_HELPER) {
+ can_reach = true;
+ } else {
+ // we assume edge assertions only exist on level 0
+ const CharReach &cr = g[v].char_reach;
+ NFAEdge e = edge(u, v, g);
+
+ if (cr.test(state.cur) &&
+ (!e || canReach(g, e, state))) {
+ can_reach = true;
}
+ }
- // if this is first vertex since start, use current offset as SOM
+ // check edge assertions if we are allowed to reach accept
+ DEBUG_PRINTF("reaching %zu->%zu ('%c'->'%c'): %s\n",
+ g[u].index, g[v].index,
+ ourisprint(state.prev) ? state.prev : '?',
+ ourisprint(state.cur) ? state.cur : '?',
+ can_reach ? "yes" : "no");
+
+ if (can_reach) {
+ // we should use current offset as SOM if:
+ // - we're at level 0 and we're a start vertex
+ // - we're a fake start shadow
size_t next_som;
- if (u == g.start || u == g.startDs || is_virtual_start(u, g)) {
+ bool reset = is_any_start(u, g) && cur.level == 0;
+ reset |= is_virtual_start(u, g) &&
+ cur.type == StateSet::State::NODE_SHADOW;
+
+ if (reset) {
next_som = state.offset;
} else {
// else, inherit SOM from predecessor
- next_som = u_som_offset;
+ next_som = cur.som;
}
+ succ.som = next_som;
- // check if the vertex is already active
- // if this vertex is not yet active, use current SOM
- if (!state.next.s.test(v_idx)) {
- state.next.s.set(v_idx);
- state.next.som[v_idx] = next_som;
- } else {
- // else, work out leftmost SOM
- state.next.som[v_idx] =
- min(next_som, state.next.som[v_idx]);
- }
+ DEBUG_PRINTF("src: idx %zu level: %u som: %zu type: %s\n",
+ cur.idx, cur.level, cur.som,
+ cur.type == StateSet::State::NODE_HELPER ? "H" : "S");
+ DEBUG_PRINTF("dst: idx %zu level: %u som: %zu type: %s\n",
+ succ.idx, succ.level, succ.som,
+ succ.type == StateSet::State::NODE_HELPER ? "H" : "S");
+
+ // activate successor (SOM will be handled by activateState)
+ state.next.activateState(succ);
}
}
}
* Fills \a matches with offsets into the data stream where a match is found.
*/
void findMatches(const NGHolder &g, const ReportManager &rm,
- const string &input, MatchSet &matches, const bool notEod,
- const bool som, const bool utf8) {
+ const string &input, MatchSet &matches,
+ const u32 edit_distance, const bool notEod, const bool utf8) {
assert(hasCorrectlyNumberedVertices(g));
+ // cannot match fuzzy utf8 patterns, this should've been filtered out at
+ // compile time, so make it an assert
+ assert(!edit_distance || !utf8);
+
+ DEBUG_PRINTF("Finding matches\n");
+
+ GraphCache gc(edit_distance, g);
+#ifdef DEBUG
+ gc.dumpStateTransitionTable(g);
+#endif
const bool allowStartDs = (proper_out_degree(g.startDs, g) > 0);
- struct fmstate state(g, som, utf8, allowStartDs, rm);
+ struct fmstate state(g, gc, utf8, allowStartDs, edit_distance, rm);
for (auto it = input.begin(), ite = input.end(); it != ite; ++it) {
+#ifdef DEBUG
+ state.states.dumpActiveStates();
+#endif
state.offset = distance(input.begin(), it);
state.cur = *it;
getMatches(g, matches, state, false);
- DEBUG_PRINTF("index %zu, %zu states on\n", state.offset,
- state.next.s.count());
- if (state.next.s.empty()) {
- if (state.som) {
- filterMatches(matches);
- }
+ DEBUG_PRINTF("offset %zu, %zu states on\n", state.offset,
+ state.next.count());
+ if (state.next.empty()) {
+ filterMatches(matches);
return;
}
state.states = state.next;
state.prev = state.cur;
}
+#ifdef DEBUG
+ state.states.dumpActiveStates();
+#endif
state.offset = input.size();
state.cur = 0;
// do additional step to get matches after stream end, this time count eod
// matches also (or not, if we're in notEod mode)
+ DEBUG_PRINTF("Looking for EOD matches\n");
getMatches(g, matches, state, !notEod);
- if (state.som) {
- filterMatches(matches);
- }
+ filterMatches(matches);
}