--- /dev/null
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "ng_violet.h"
+
+#include "grey.h"
+#include "ng_depth.h"
+#include "ng_dominators.h"
+#include "ng_dump.h"
+#include "ng_equivalence.h"
+#include "ng_holder.h"
+#include "ng_is_equal.h"
+#include "ng_literal_analysis.h"
+#include "ng_netflow.h"
+#include "ng_prune.h"
+#include "ng_redundancy.h"
+#include "ng_region.h"
+#include "ng_reports.h"
+#include "ng_rose.h"
+#include "ng_split.h"
+#include "ng_util.h"
+#include "ng_width.h"
+#include "rose/rose_build.h"
+#include "rose/rose_build_util.h"
+#include "rose/rose_in_dump.h"
+#include "rose/rose_in_graph.h"
+#include "rose/rose_in_util.h"
+#include "util/compare.h"
+#include "util/compile_context.h"
+#include "util/container.h"
+#include "util/graph.h"
+#include "util/graph_range.h"
+#include "util/make_unique.h"
+#include "util/order_check.h"
+#include "util/target_info.h"
+#include "util/ue2string.h"
+#include "util/ue2_containers.h"
+
+#include <set>
+#include <utility>
+#include <vector>
+#include <boost/core/noncopyable.hpp>
+#include <boost/graph/reverse_graph.hpp>
+#include <boost/graph/topological_sort.hpp>
+#include <boost/range/adaptor/map.hpp>
+
+#define STAGE_DEBUG_PRINTF DEBUG_PRINTF
+
+using namespace std;
+using boost::adaptors::map_values;
+
+namespace ue2 {
+
+/* createsAnchoredLHS() is conservative as the depths take into account
+ * back edges that come from beyond the split point and would be missing after
+ * the graph is split. */
+static
+bool createsAnchoredLHS(const NGHolder &g, const vector<NFAVertex> &vv,
+                        const vector<NFAVertexDepth> &depths,
+                        const Grey &grey, depth max_depth = depth::infinity()) {
+    /* Returns true only if every non-self predecessor of every vertex in vv
+     * has a max distance from start of data strictly below the bound. The
+     * bound is the tighter of the caller's limit and grey.maxAnchoredRegion. */
+    const depth bound = min(max_depth, depth(grey.maxAnchoredRegion));
+
+    for (auto pivot : vv) {
+        /* self loops are skipped so they cannot blow out the depths: only
+         * proper predecessors of the pivot are examined */
+        for (auto pred : inv_adjacent_vertices_range(pivot, g)) {
+            if (pred != pivot) {
+                const u32 pred_idx = g[pred].index;
+                assert(pred_idx < depths.size());
+                if (maxDistFromStartOfData(depths.at(pred_idx)) >= bound) {
+                    return false;
+                }
+            }
+        }
+    }
+    return true;
+}
+
+/* createsTransientLHS() is conservative as the depths take into account
+ * back edges that come from beyond the split point and would be missing after
+ * the graph is split. */
+static
+bool createsTransientLHS(const NGHolder &g, const vector<NFAVertex> &vv,
+                         const vector<NFAVertexDepth> &depths,
+                         const Grey &grey) {
+    /* Returns true only if every non-self predecessor of every vertex in vv
+     * has a max distance from init strictly below grey.maxHistoryAvailable. */
+    const depth history_limit(grey.maxHistoryAvailable);
+
+    for (auto pivot : vv) {
+        /* self loops are skipped so they cannot blow out the depths: only
+         * proper predecessors of the pivot are examined */
+        for (auto pred : inv_adjacent_vertices_range(pivot, g)) {
+            if (pred != pivot) {
+                const u32 pred_idx = g[pred].index;
+                assert(pred_idx < depths.size());
+                if (maxDistFromInit(depths.at(pred_idx)) >= history_limit) {
+                    return false;
+                }
+            }
+        }
+    }
+    return true;
+}
+
+namespace {
+/**
+ * Information on a cut: vertices and literals.
+ */
+struct VertLitInfo {
+    /** Empty cut: no vertices, no literals, both flags false. */
+    VertLitInfo() {}
+
+    /** Cut at the single vertex \p v with literal set \p litlit. */
+    VertLitInfo(NFAVertex v, const set<ue2_literal> &litlit, bool c_anch,
+                bool c_tran = false)
+        : vv(1, v), lit(litlit), creates_anchored(c_anch),
+          creates_transient(c_tran) {}
+
+    /** Cut across the vertices \p vv_in; creates_transient keeps its
+     * in-class default of false. */
+    VertLitInfo(const vector<NFAVertex> &vv_in, const set<ue2_literal> &lit_in,
+                bool c_anch)
+        : vv(vv_in), lit(lit_in), creates_anchored(c_anch) {}
+
+    vector<NFAVertex> vv; /**< vertices making up the cut */
+    set<ue2_literal> lit; /**< literals associated with the cut */
+
+    bool creates_anchored = false;
+    bool creates_transient = false;
+};
+
+/**
+ * \brief Comparator class for sorting LitCollection::lits.
+ *
+ * This is separated out from LitCollection itself as passing LitCollection to
+ * std::sort() would incur a (potentially expensive) copy.
+ */
+class LitComparator {
+public:
+    LitComparator(const NGHolder &g_in, bool sa, bool st)
+        : g(g_in), seeking_anchored(sa), seeking_transient(st) {}
+
+    /**
+     * Strict ordering over candidate cuts. When the corresponding seeking_*
+     * flag is set, a cut without the anchored (then transient) property
+     * orders before one with it. Otherwise the cut with the higher
+     * scoreSet() value orders first, and ties are broken on the index of
+     * the first cut vertex (higher index first).
+     */
+    bool operator()(const unique_ptr<VertLitInfo> &a,
+                    const unique_ptr<VertLitInfo> &b) const {
+        assert(a && b);
+
+        if (seeking_anchored && a->creates_anchored != b->creates_anchored) {
+            return a->creates_anchored < b->creates_anchored;
+        }
+
+        if (seeking_transient
+            && a->creates_transient != b->creates_transient) {
+            return a->creates_transient < b->creates_transient;
+        }
+
+        const u64a score_a = scoreSet(a->lit);
+        const u64a score_b = scoreSet(b->lit);
+        if (score_a != score_b) {
+            return score_a > score_b;
+        }
+
+        /* vertices should only be in one candidate cut */
+        assert(a->vv == b->vv || a->vv.front() != b->vv.front());
+        return g[a->vv.front()].index > g[b->vv.front()].index;
+    }
+
+private:
+    const NGHolder &g; /**< graph on which cuts are found */
+
+    bool seeking_anchored;
+    bool seeking_transient;
+};
+}
+
+static
+size_t shorter_than(const set<ue2_literal> &s, size_t limit) {
+    /* number of literals in s whose length is strictly below limit */
+    return (size_t)count_if(s.begin(), s.end(),
+                            [limit](const ue2_literal &lit) {
+                                return lit.length() < limit;
+                            });
+}
+
+static
+u32 min_len(const set<ue2_literal> &s) {
+    /* length of the shortest literal in s; ~0U if s is empty */
+    u32 shortest = ~0U;
+
+    for (auto it = s.begin(); it != s.end(); ++it) {
+        const u32 len = (u32)it->length();
+        if (len < shortest) {
+            shortest = len;
+        }
+    }
+
+    return shortest;
+}
+
+static
+u32 min_period(const set<ue2_literal> &s) {
+    /* smallest minStringPeriod() over the literals in s; ~0U if s is empty */
+    u32 smallest = ~0U;
+
+    for (auto it = s.begin(); it != s.end(); ++it) {
+        const u32 period = (u32)minStringPeriod(*it);
+        if (period < smallest) {
+            smallest = period;
+        }
+    }
+    DEBUG_PRINTF("min period %u\n", smallest);
+    return smallest;
+}
+
+#define MIN_ANCHORED_LEN 2
+
+static
+bool validateRoseLiteralSetQuality(const set<ue2_literal> &s, u64a score,
+                                   bool anchored, u32 min_allowed_floating_len,
+                                   bool desperation) {
+    /* Decides whether the literal set s (with precomputed score) is good
+     * enough to cut on. Anchored candidates only need MIN_ANCHORED_LEN bytes;
+     * floating ones need min_allowed_floating_len. With desperation set, two
+     * looser fallback rules are tried before the set is rejected. */
+    const u32 min_allowed_len = anchored ? MIN_ANCHORED_LEN
+                                         : min_allowed_floating_len;
+
+    assert(none_of(begin(s), end(s), bad_mixed_sensitivity));
+
+    if (score >= NO_LITERAL_AT_EDGE_SCORE) {
+        DEBUG_PRINTF("candidate is too bad %llu/%zu\n", score, s.size());
+        return false;
+    }
+
+    assert(!s.empty());
+    if (s.empty()) {
+        DEBUG_PRINTF("candidate is too bad/something went wrong\n");
+        return false;
+    }
+
+    const u32 s_min_len = min_len(s);
+    const u32 s_min_period = min_period(s);
+    const size_t short_count = shorter_than(s, 5);
+
+    DEBUG_PRINTF("cand '%s': score %llu count=%zu min_len=%u min_period=%u"
+                 " short_count=%zu desp=%d\n",
+                 dumpString(*s.begin()).c_str(), score, s.size(), s_min_len,
+                 s_min_period, short_count, (int)desperation);
+
+    /* normal rule: few literals, long enough, and not a single repeated
+     * byte (unless single byte literals are explicitly allowed) */
+    const bool too_many = s.size() > 10; /* magic number is magic */
+    const bool too_short = s_min_len < min_allowed_len;
+    const bool too_periodic = s_min_period <= 1 && min_allowed_len != 1;
+    if (!too_many && !too_short && !too_periodic) {
+        return true;
+    }
+
+    if (desperation && s_min_period > 1) {
+        /* more magic numbers are magical */
+        if (s.size() <= 20
+            && (s_min_len > 5 || (s_min_len > 2 && short_count <= 10))) {
+            DEBUG_PRINTF("candidate is ok\n");
+            return true;
+        }
+
+        /* more magic numbers are magical */
+        if (s.size() <= 50 && s_min_len > 10) {
+            DEBUG_PRINTF("candidate is ok\n");
+            return true;
+        }
+    }
+
+    DEBUG_PRINTF("candidate is too shitty\n");
+    return false;
+}
+
+static UNUSED
+void dumpRoseLiteralSet(const set<ue2_literal> &s) {
+    /* debug helper: emits one DEBUG_PRINTF line per literal in s */
+    for (auto it = s.begin(); it != s.end(); ++it) {
+        DEBUG_PRINTF(" lit: %s\n", dumpString(*it).c_str());
+    }
+}
+
+static
+void getSimpleRoseLiterals(const NGHolder &g, bool seeking_anchored,
+                           const vector<NFAVertexDepth> *depths,
+                           const set<NFAVertex> &a_dom,
+                           vector<unique_ptr<VertLitInfo>> *lits,
+                           u32 min_allowed_len, bool desperation,
+                           const CompileContext &cc) {
+    /* Builds one candidate cut per vertex of a_dom whose literal set passes
+     * validateRoseLiteralSetQuality(), and appends the survivors to lits.
+     * depths must be non-null when seeking_anchored is set. */
+    assert(depths || !seeking_anchored);
+
+    map<NFAVertex, u64a> scores;
+    map<NFAVertex, unique_ptr<VertLitInfo>> lit_info;
+
+    for (auto v : a_dom) {
+        /* RHS will take responsibility for any revisits to the target
+         * vertex */
+        set<ue2_literal> s = getLiteralSet(g, v, true);
+
+        if (s.empty()) {
+            DEBUG_PRINTF("candidate is too shitty\n");
+            continue;
+        }
+
+        DEBUG_PRINTF("|candidate raw literal set| = %zu\n", s.size());
+        dumpRoseLiteralSet(s);
+        const u64a score = sanitizeAndCompressAndScore(s);
+
+        const bool anchored
+            = seeking_anchored && createsAnchoredLHS(g, {v}, *depths, cc.grey);
+
+        if (validateRoseLiteralSetQuality(s, score, anchored, min_allowed_len,
+                                          desperation)) {
+            DEBUG_PRINTF("candidate is a candidate\n");
+            scores[v] = score;
+            lit_info[v] = make_unique<VertLitInfo>(v, s, anchored);
+        }
+    }
+
+    /* try to filter out cases where appending some characters produces worse
+     * literals. Only bother to look back one byte, TODO make better */
+    for (auto u : a_dom) {
+        if (out_degree(u, g) != 1) {
+            continue;
+        }
+
+        /* u needs a recorded, non-zero score to be compared against */
+        const auto u_it = scores.find(u);
+        if (u_it == scores.end() || !u_it->second) {
+            continue;
+        }
+
+        NFAVertex v = *adjacent_vertices(u, g).first;
+        const auto v_it = scores.find(v);
+        if (v_it != scores.end() && v_it->second >= u_it->second) {
+            DEBUG_PRINTF("killing off v as score %llu >= %llu\n",
+                         v_it->second, u_it->second);
+            lit_info.erase(v);
+        }
+    }
+
+    lits->reserve(lit_info.size());
+    for (auto &entry : lit_info) {
+        lits->push_back(move(entry.second));
+    }
+    DEBUG_PRINTF("%zu candidate literal sets\n", lits->size());
+}
+
+/* Collects candidate cuts made up of all the exit vertices of a region and
+ * appends them to lits. Only regions recorded as mandatory are considered. A
+ * region is skipped if any of its exits is in bad, is missing from allowed
+ * (when allowed is non-null), disagrees with the first exit on its edges to
+ * accept/acceptEod or on its reports, or yields an empty literal set. The
+ * union of the exits' literal sets must then pass
+ * validateRoseLiteralSetQuality(). depths must be non-null when
+ * seeking_anchored is set. */
+static
+void getRegionRoseLiterals(const NGHolder &g, bool seeking_anchored,
+ const vector<NFAVertexDepth> *depths,
+ const set<NFAVertex> &bad,
+ const set<NFAVertex> *allowed,
+ vector<unique_ptr<VertLitInfo>> *lits,
+ u32 min_allowed_len, bool desperation,
+ const CompileContext &cc) {
+ /* This allows us to get more places to split the graph as we are not
+ limited to points where there is a single vertex to split at. */
+
+ assert(depths || !seeking_anchored);
+
+ /* TODO: operate over 'proto-regions' which ignore back edges */
+ auto regions = assignRegions(g);
+
+ /* mand/optional: region ids classified via isOptionalRegion();
+ * exits: region id -> exit vertices of that region */
+ set<u32> mand, optional;
+ map<u32, vector<NFAVertex> > exits;
+
+ for (auto v : vertices_range(g)) {
+ u32 region = regions[v];
+ if (is_any_start(v, g) || region == 0) {
+ continue;
+ }
+
+ if (is_any_accept(v, g)) {
+ continue;
+ }
+
+ if (!generates_callbacks(g) && is_match_vertex(v, g)) {
+ /* we cannot leave a completely vacuous infix */
+ continue;
+ }
+
+ if (isRegionExit(g, v, regions)) {
+ exits[region].push_back(v);
+ }
+
+ if (isRegionEntry(g, v, regions)) {
+ // Determine whether this region is mandatory or optional. We only
+ // need to do this check for the first entry vertex we encounter
+ // for this region.
+ if (!contains(mand, region) && !contains(optional, region)) {
+ if (isOptionalRegion(g, v, regions)) {
+ optional.insert(region);
+ } else {
+ mand.insert(region);
+ }
+ }
+ }
+ }
+
+ for (const auto &m : exits) {
+ /* never entered from the top: this is only the landing pad for
+ * 'goto next_cand', which abandons the current region and moves on to
+ * the next one */
+ if (false) {
+ next_cand:
+ continue;
+ }
+
+ const u32 region = m.first;
+ const vector<NFAVertex> &vv = m.second;
+ assert(!vv.empty());
+
+ if (!contains(mand, region)) {
+ continue;
+ }
+
+ for (auto v : vv) {
+ /* if an exit is in bad, the region is already handled well
+ * by getSimpleRoseLiterals or is otherwise bad */
+ if (contains(bad, v)) {
+ goto next_cand;
+ }
+ /* if we are only allowed to consider some vertices, v must be in
+ the list; */
+ if (allowed && !contains(*allowed, v)) {
+ goto next_cand;
+ }
+ }
+
+ /* the final region may not have a neat exit. validate that all exits
+ * have an edge to each accept or none do */
+ bool edge_to_a = edge(vv[0], g.accept, g).second;
+ bool edge_to_aeod = edge(vv[0], g.acceptEod, g).second;
+ const auto &reports = g[vv[0]].reports;
+ for (auto v : vv) {
+ if (edge_to_a != edge(v, g.accept, g).second) {
+ goto next_cand;
+ }
+
+ if (edge_to_aeod != edge(v, g.acceptEod, g).second) {
+ goto next_cand;
+ }
+
+ if (g[v].reports != reports) {
+ goto next_cand;
+ }
+ }
+
+ DEBUG_PRINTF("inspecting region %u\n", region);
+ /* s accumulates the union of the literal sets of every exit */
+ set<ue2_literal> s;
+ for (auto v : vv) {
+ DEBUG_PRINTF(" exit vertex: %u\n", g[v].index);
+ /* Note: RHS can not be depended on to take all subsequent revisits
+ * to this vertex */
+ set<ue2_literal> ss = getLiteralSet(g, v, false);
+ if (ss.empty()) {
+ DEBUG_PRINTF("candidate is too shitty\n");
+ goto next_cand;
+ }
+ insert(&s, ss);
+ }
+
+ assert(!s.empty());
+
+ DEBUG_PRINTF("|candidate raw literal set| = %zu\n", s.size());
+ dumpRoseLiteralSet(s);
+ u64a score = sanitizeAndCompressAndScore(s);
+
+ DEBUG_PRINTF("|candidate literal set| = %zu\n", s.size());
+ dumpRoseLiteralSet(s);
+
+ bool anchored = false;
+ if (seeking_anchored) {
+ anchored = createsAnchoredLHS(g, vv, *depths, cc.grey);
+ }
+
+ if (!validateRoseLiteralSetQuality(s, score, anchored, min_allowed_len,
+ desperation)) {
+ goto next_cand;
+ }
+
+ DEBUG_PRINTF("candidate is a candidate\n");
+ lits->push_back(make_unique<VertLitInfo>(vv, s, anchored));
+ }
+}
+
+static
+void filterCandPivots(const NGHolder &g, const set<NFAVertex> &cand_raw,
+                      set<NFAVertex> *out) {
+    /* Copies into out the vertices of cand_raw that are plausible pivots. */
+    for (auto u : cand_raw) {
+        const CharReach &u_cr = g[u].char_reach;
+        const auto width = u_cr.count();
+
+        if (width > 40) {
+            continue; /* too wide to be plausible */
+        }
+
+        if (width <= 2) {
+            /* a narrow u is dropped when its sole successor v (reachable
+             * only through u) is a single or caseless character */
+            NFAVertex v = getSoleDestVertex(g, u);
+            if (v && in_degree(v, g) == 1 && out_degree(u, g) == 1) {
+                const CharReach &v_cr = g[v].char_reach;
+                if (v_cr.count() == 1 || v_cr.isCaselessChar()) {
+                    continue; /* v will always generate better literals */
+                }
+            }
+        }
+
+        /* for wider reaches (3..40), include u as a candidate as successor
+         * may have backed away from expanding through it */
+        out->insert(u);
+    }
+}
+
+/* cand_raw is the candidate set before filtering points which are clearly
+ * a bad idea. */
+static
+void getCandidatePivots(const NGHolder &g, set<NFAVertex> *cand,
+                        set<NFAVertex> *cand_raw) {
+    /* Finds the non-special vertices that dominate every non-special
+     * predecessor of accept/acceptEod (the common prefix of their dominator
+     * chains). These are added to cand_raw; the subset surviving
+     * filterCandPivots() is added to cand. If there is no non-special
+     * predecessor of an accept, both sets are left untouched. */
+    ue2::unordered_map<NFAVertex, NFAVertex> dominators = findDominators(g);
+
+    set<NFAVertex> accepts;
+
+    for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
+        if (!is_special(v, g)) {
+            accepts.insert(v);
+        }
+    }
+    for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
+        if (!is_special(v, g)) {
+            accepts.insert(v);
+        }
+    }
+
+    assert(!accepts.empty());
+    if (accepts.empty()) {
+        /* with asserts compiled out, dereferencing accepts.begin() below
+         * would be undefined behaviour; there is nothing to trace anyway */
+        return;
+    }
+
+    /* chain of non-special dominators ending at 'from', outermost first */
+    auto trace_dominators = [&](NFAVertex from) -> vector<NFAVertex> {
+        vector<NFAVertex> trace;
+        for (NFAVertex curr = from; curr && !is_special(curr, g);
+             curr = dominators[curr]) {
+            trace.push_back(curr);
+        }
+        reverse(trace.begin(), trace.end());
+        return trace;
+    };
+
+    auto ait = accepts.begin();
+    vector<NFAVertex> dom_trace = trace_dominators(*ait);
+
+    /* trim dom_trace down to the prefix it shares with every other accept
+     * predecessor's dominator chain */
+    for (++ait; ait != accepts.end(); ++ait) {
+        const vector<NFAVertex> other = trace_dominators(*ait);
+
+        auto dti = dom_trace.begin(), dtie = dom_trace.end();
+        auto dtj = other.begin(), dtje = other.end();
+        while (dti != dtie && dtj != dtje && *dti == *dtj) {
+            ++dti;
+            ++dtj;
+        }
+        dom_trace.erase(dti, dtie);
+    }
+
+    cand_raw->insert(dom_trace.begin(), dom_trace.end());
+
+    filterCandPivots(g, *cand_raw, cand);
+}
+
+/* Picks the best cut of g from the single-vertex candidates
+ * (getSimpleRoseLiterals) and the region-exit candidates
+ * (getRegionRoseLiterals), ranked with LitComparator.
+ *
+ * allowed_cand, if non-null, restricts the candidate vertices to that set.
+ * disallowed_cand, if non-null, is removed from the simple candidates and added
+ * to the set handed to the region search as its 'bad' vertices.
+ * depths must be non-null when for_prefix is set (it is dereferenced then).
+ *
+ * Returns nullptr if no acceptable candidate literal set is found. */
+static
+unique_ptr<VertLitInfo> findBestSplit(const NGHolder &g,
+ const vector<NFAVertexDepth> *depths,
+ bool for_prefix, u32 min_len,
+ const set<NFAVertex> *allowed_cand,
+ const set<NFAVertex> *disallowed_cand,
+ const CompileContext &cc) {
+ assert(!for_prefix || depths);
+
+ /* look for a single simple split point */
+ set<NFAVertex> cand;
+ set<NFAVertex> cand_raw;
+
+ getCandidatePivots(g, &cand, &cand_raw);
+
+ if (allowed_cand) {
+ /* keep only the candidates that are also in allowed_cand */
+ set<NFAVertex> cand2;
+ set<NFAVertex> cand2_raw;
+ set_intersection(allowed_cand->begin(), allowed_cand->end(),
+ cand.begin(), cand.end(),
+ inserter(cand2, cand2.begin()));
+
+ set_intersection(allowed_cand->begin(), allowed_cand->end(),
+ cand_raw.begin(), cand_raw.end(),
+ inserter(cand2_raw, cand2_raw.begin()));
+
+ cand = std::move(cand2);
+ cand_raw = std::move(cand2_raw);
+ }
+ if (disallowed_cand) {
+ DEBUG_PRINTF("%zu disallowed candidates\n", disallowed_cand->size());
+ DEBUG_PRINTF("|old cand| = %zu\n", cand.size());
+ /* cand_raw is later passed to getRegionRoseLiterals() as 'bad', so
+ * adding the disallowed vertices there excludes regions exiting
+ * through them */
+ erase_all(&cand, *disallowed_cand);
+ insert(&cand_raw, *disallowed_cand);
+ }
+
+ if (!generates_callbacks(g)) {
+ /* not output exposed so must leave some RHS */
+ for (NFAVertex v : inv_adjacent_vertices_range(g.accept, g)) {
+ cand.erase(v);
+ cand_raw.erase(v);
+ }
+
+ for (NFAVertex v : inv_adjacent_vertices_range(g.acceptEod, g)) {
+ cand.erase(v);
+ cand_raw.erase(v);
+ }
+ }
+
+ DEBUG_PRINTF("|cand| = %zu\n", cand.size());
+
+ bool seeking_anchored = for_prefix;
+ bool seeking_transient = for_prefix; //cc.streaming;
+
+ /* TODO: revisit when backstop goes away */
+ bool desperation = for_prefix && cc.streaming;
+
+ vector<unique_ptr<VertLitInfo>> lits; /**< sorted list of potential cuts */
+
+ getSimpleRoseLiterals(g, seeking_anchored, depths, cand, &lits, min_len,
+ desperation, cc);
+ getRegionRoseLiterals(g, seeking_anchored, depths, cand_raw, allowed_cand,
+ &lits, min_len, desperation, cc);
+
+ if (lits.empty()) {
+ DEBUG_PRINTF("no literals found\n");
+ return nullptr;
+ }
+
+ if (seeking_transient) {
+ for (auto &a : lits) {
+ a->creates_transient
+ = createsTransientLHS(g, a->vv, *depths, cc.grey);
+ }
+ }
+
+ auto cmp = LitComparator(g, seeking_anchored, seeking_transient);
+
+ /* linear scan from the back of lits for the element that orders last
+ * under cmp: best is replaced whenever cmp(best, other) is true */
+ unique_ptr<VertLitInfo> best = move(lits.back());
+ lits.pop_back();
+ while (!lits.empty()) {
+ if (cmp(best, lits.back())) {
+ best = move(lits.back());
+ }
+ lits.pop_back();
+ }
+
+ DEBUG_PRINTF("best is '%s' %u a%d t%d\n",
+ ((const string &)*best->lit.begin()).c_str(),
+ g[best->vv.front()].index,
+ depths ? (int)createsAnchoredLHS(g, best->vv, *depths, cc.grey) : 0,
+ depths ? (int)createsTransientLHS(g, best->vv, *depths, cc.grey) : 0);
+
+ return best;
+}
+
+/* Adds to bad the edges of h lying on paths that end at h.accept and whose
+ * vertices can match the successor literal succ. The literal is walked from its
+ * last character backwards; for each vertex reached, the set of edges on the
+ * paths leading from it to accept is tracked. If overhang_ok is set, a path
+ * reaching h.start before the literal is exhausted is poisoned immediately. */
+static
+void poisonFromSuccessor(const NGHolder &h, const ue2_literal &succ,
+ bool overhang_ok, flat_set<NFAEdge> &bad) {
+ DEBUG_PRINTF("poisoning holder of size %zu, succ len %zu\n",
+ num_vertices(h), succ.length());
+
+ /* curr: vertex -> edges on the matched paths from that vertex to accept */
+ map<NFAVertex, flat_set<NFAEdge> > curr;
+ for (const auto &e : in_edges_range(h.accept, h)) {
+ curr[source(e, h)].insert(e);
+ }
+
+ map<NFAVertex, flat_set<NFAEdge> > next;
+ for (auto it = succ.rbegin(); it != succ.rend(); ++it) {
+ for (const auto &path : curr) {
+ NFAVertex u = path.first;
+ const auto &path_set = path.second;
+ if (u == h.start && overhang_ok) {
+ DEBUG_PRINTF("poisoning early %zu [overhang]\n",
+ path_set.size());
+ insert(&bad, path_set);
+ continue;
+ }
+ if (overlaps(h[u].char_reach, *it)) {
+ /* u can match this literal character: extend the path set
+ * through every in-edge of u */
+ for (const auto &e : in_edges_range(u, h)) {
+ auto &new_path_set = next[source(e, h)];
+ insert(&new_path_set, path_set);
+ new_path_set.insert(e);
+ }
+ }
+ }
+ DEBUG_PRINTF("succ char matches at %zu paths\n", next.size());
+ assert(overhang_ok || !curr.empty());
+ swap(curr, next);
+ next.clear();
+ }
+
+ assert(overhang_ok || !curr.empty());
+ /* everything still tracked matched the whole literal */
+ for (const auto &path : curr) {
+ insert(&bad, path.second);
+ /* note: path.second holds edges, despite the wording of the message */
+ DEBUG_PRINTF("poisoning %zu vertices\n", path.second.size());
+ }
+}
+
+static
+void poisonForGoodPrefix(const NGHolder &h,
+                         const vector<NFAVertexDepth> &depths,
+                         flat_set<NFAEdge> &bad, const Grey &grey) {
+    /* Poisons the in-edges of every vertex at which a cut would produce a
+     * LHS that is neither anchored nor transient. */
+    for (const auto &v : vertices_range(h)) {
+        if (createsAnchoredLHS(h, {v}, depths, grey)) {
+            continue;
+        }
+        if (createsTransientLHS(h, {v}, depths, grey)) {
+            continue;
+        }
+        insert(&bad, in_edges_range(v, h));
+    }
+}
+
+static
+flat_set<NFAEdge> poisonEdges(const NGHolder &h,
+                              const vector<NFAVertexDepth> *depths,
+                              const RoseInGraph &vg, const vector<RoseInEdge> &ee,
+                              bool for_prefix, const Grey &grey) {
+    /* Returns the edges of h that should not be cut on: those covered by a
+     * successor literal of any edge in ee and, when for_prefix is set, those
+     * flagged by poisonForGoodPrefix(). depths is only dereferenced when
+     * for_prefix is set. */
+    DEBUG_PRINTF("poisoning edges %zu successor edges\n", ee.size());
+
+    /* distinct (successor literal, predecessor-is-a-literal) pairs */
+    set<pair<ue2_literal, bool> > succs;
+    for (const RoseInEdge &ve : ee) {
+        const auto &dest = vg[target(ve, vg)];
+        if (dest.type == RIV_LITERAL) {
+            const bool pred_is_literal = vg[source(ve, vg)].type == RIV_LITERAL;
+            succs.insert({dest.s, pred_is_literal});
+        } else {
+            /* nothing to poison in suffixes/outfixes */
+            assert(dest.type == RIV_ACCEPT);
+        }
+    }
+
+    DEBUG_PRINTF("poisoning edges %zu successor literals\n", succs.size());
+
+    /* poison edges covered by successor literal */
+    flat_set<NFAEdge> bad;
+    for (const auto &succ : succs) {
+        poisonFromSuccessor(h, succ.first, succ.second, bad);
+    }
+
+    /* poison edges which don't significantly improve a prefix */
+    if (for_prefix) {
+        poisonForGoodPrefix(h, *depths, bad, grey);
+    }
+
+    return bad;
+}
+
+static
+set<NFAVertex> poisonVertices(const NGHolder &h, const RoseInGraph &vg,
+                              const vector<RoseInEdge> &ee, const Grey &grey) {
+    /* Returns the target vertex of every edge poisoned by poisonEdges() in
+     * its non-prefix mode. */
+    const flat_set<NFAEdge> poisoned = poisonEdges(h, nullptr, vg, ee, false,
+                                                   grey);
+
+    set<NFAVertex> bad_vertices;
+    for (const NFAEdge &e : poisoned) {
+        DEBUG_PRINTF("bad: %u->%u\n", h[source(e, h)].index,
+                     h[target(e, h)].index);
+        bad_vertices.insert(target(e, h));
+    }
+
+    return bad_vertices;
+}
+
+static
+unique_ptr<VertLitInfo> findBestNormalSplit(const NGHolder &g,
+                                            const RoseInGraph &vg,
+                                            const vector<RoseInEdge> &ee,
+                                            const CompileContext &cc) {
+    /* Best split for a non-prefix graph: poisoned vertices are disallowed
+     * as simple candidates and no depth information is used. */
+    assert(g.kind == NFA_OUTFIX || g.kind == NFA_INFIX || g.kind == NFA_SUFFIX);
+
+    const set<NFAVertex> poisoned = poisonVertices(g, vg, ee, cc.grey);
+    const u32 min_len = cc.grey.minRoseLiteralLength;
+
+    return findBestSplit(g, nullptr, false, min_len, nullptr, &poisoned, cc);
+}
+
+static
+unique_ptr<VertLitInfo> findSimplePrefixSplit(const NGHolder &g,
+                                              const CompileContext &cc) {
+    /* Walks the chain of vertices following the start state, building up
+     * the literal it spells, and returns a cut after the longest literal
+     * seen. Returns nullptr if the start of the graph is not a simple chain
+     * or the literal is shorter than grey.minRoseLiteralLength. */
+    DEBUG_PRINTF("looking for simple prefix split\n");
+    const bool anchored = !proper_out_degree(g.startDs, g);
+    const NFAVertex root = anchored ? g.start : g.startDs;
+
+    if (out_degree(root, g) != 2) { /* startDs + succ */
+        return nullptr;
+    }
+
+    /* the one successor of root which is not startDs */
+    NFAVertex v = NGHolder::null_vertex();
+    for (NFAVertex t : adjacent_vertices_range(root, g)) {
+        if (t == g.startDs) {
+            continue;
+        }
+        assert(!v);
+        v = t;
+    }
+    assert(v);
+
+    if (!anchored) {
+        /* start may only lead to startDs and, optionally, to v */
+        const auto start_out = out_degree(g.start, g);
+        if (start_out > 2) {
+            return nullptr;
+        }
+        if (start_out == 2 && !edge(g.start, v, g).second) {
+            return nullptr;
+        }
+    }
+
+    u32 limit = cc.grey.maxHistoryAvailable;
+    if (anchored) {
+        LIMIT_TO_AT_MOST(&limit, cc.grey.maxAnchoredRegion);
+    }
+
+    NFAVertex best_v = NGHolder::null_vertex();
+    ue2_literal best_lit;
+    ue2_literal curr_lit;
+
+    u32 steps = 0;
+    while (steps < limit) {
+        const auto &v_cr = g[v].char_reach;
+        if (v_cr.count() == 1 || v_cr.isCaselessChar()) {
+            curr_lit.push_back(v_cr.find_first(), v_cr.isCaselessChar());
+        } else {
+            /* not a literal character: start the literal afresh */
+            curr_lit.clear();
+        }
+
+        if (curr_lit.length() > best_lit.length()) {
+            best_lit = curr_lit;
+            best_v = v;
+        }
+
+        /* only keep walking while the chain does not branch */
+        if (out_degree(v, g) != 1) {
+            break;
+        }
+        v = *adjacent_vertices(v, g).first;
+        ++steps;
+    }
+
+    if (best_lit.length() < cc.grey.minRoseLiteralLength) {
+        return nullptr;
+    }
+
+    set<ue2_literal> best_lit_set({best_lit});
+    if (bad_mixed_sensitivity(best_lit)) {
+        sanitizeAndCompressAndScore(best_lit_set);
+    }
+
+    return ue2::make_unique<VertLitInfo>(best_v, best_lit_set, anchored, true);
+}
+
+static
+unique_ptr<VertLitInfo> findBestPrefixSplit(const NGHolder &g,
+                                            const vector<NFAVertexDepth> &depths,
+                                            const RoseInGraph &vg,
+                                            const vector<RoseInEdge> &ee,
+                                            const CompileContext &cc) {
+    /* Best split for a prefix graph. Falls back to findSimplePrefixSplit()
+     * when the general search finds nothing anchored or transient. */
+    assert(g.kind == NFA_PREFIX);
+
+    const set<NFAVertex> poisoned = poisonVertices(g, vg, ee, cc.grey);
+    auto general = findBestSplit(g, &depths, true,
+                                 cc.grey.minRoseLiteralLength, nullptr,
+                                 &poisoned, cc);
+
+    if (general
+        && (general->creates_transient || general->creates_anchored)) {
+        return general;
+    }
+
+    /* large back edges may prevent us identifying anchored or transient
+     * cases properly - use a simple walk instead */
+    if (auto simple = findSimplePrefixSplit(g, cc)) {
+        return simple;
+    }
+
+    return general;
+}
+
+static
+unique_ptr<VertLitInfo> findBestCleanSplit(const NGHolder &g,
+                                           const CompileContext &cc) {
+    /* Looks for a split restricted to the predecessors of vertices whose
+     * char_reach.all() holds and which have a self loop. Returns nullptr if
+     * there is no such predecessor other than g.start. */
+    assert(g.kind != NFA_PREFIX);
+
+    set<NFAVertex> clean_preds;
+    for (NFAVertex v : vertices_range(g)) {
+        if (g[v].char_reach.all() && edge(v, v, g).second) {
+            insert(&clean_preds, inv_adjacent_vertices(v, g));
+            clean_preds.erase(v); /* drop the self loop's own source */
+        }
+    }
+    clean_preds.erase(g.start);
+
+    if (clean_preds.empty()) {
+        return nullptr;
+    }
+
+    const u32 min_len = cc.grey.violetEarlyCleanLiteralLen;
+    return findBestSplit(g, nullptr, false, min_len, &clean_preds, nullptr,
+                         cc);
+}
+
+static
+bool can_match(const NGHolder &g, const ue2_literal &lit, bool overhang_ok) {
+    /* Walks lit backwards from g.accept and reports whether some chain of
+     * predecessors overlaps every character of it. With overhang_ok set,
+     * reaching g.start before the literal is exhausted counts as a match. */
+    set<NFAVertex> frontier;
+    frontier.insert(g.accept);
+
+    for (auto it = lit.rbegin(); it != lit.rend(); ++it) {
+        set<NFAVertex> preds;
+
+        for (auto v : frontier) {
+            for (auto u : inv_adjacent_vertices_range(v, g)) {
+                if (u == g.start) {
+                    if (!overhang_ok) {
+                        continue; /* it is not possible for a lhs literal to
+                                   * overhang the start */
+                    }
+                    DEBUG_PRINTF("bail\n");
+                    return true;
+                }
+
+                if (overlaps(*it, g[u].char_reach)) {
+                    preds.insert(u);
+                }
+            }
+        }
+
+        frontier.swap(preds);
+    }
+
+    return !frontier.empty();
+}
+
+/* Splits base_graph at the vertices split.vv into a lhs and rhs graph and
+ * rewires vg accordingly: the edges ee are removed and, for each of their
+ * sources, a literal vertex is added for every literal of split.lit that
+ * can_match() the lhs. The source is connected to it through lhs, and it is
+ * connected onwards through rhs and/or fresh accept vertices.
+ *
+ * Returns false, leaving vg untouched, if base_graph is triggered and the split
+ * does not make the lhs/rhs smaller than base_graph; returns true otherwise. */
+static
+bool splitRoseEdge(const NGHolder &base_graph, RoseInGraph &vg,
+ const vector<RoseInEdge> &ee, const VertLitInfo &split) {
+ const vector<NFAVertex> &splitters = split.vv;
+ assert(!splitters.empty());
+
+ shared_ptr<NGHolder> lhs = make_shared<NGHolder>();
+ shared_ptr<NGHolder> rhs = make_shared<NGHolder>();
+
+ ue2::unordered_map<NFAVertex, NFAVertex> lhs_map;
+ ue2::unordered_map<NFAVertex, NFAVertex> rhs_map;
+
+ splitGraph(base_graph, splitters, lhs.get(), &lhs_map, rhs.get(), &rhs_map);
+ DEBUG_PRINTF("split %s:%zu into %s:%zu + %s:%zu\n",
+ to_string(base_graph.kind).c_str(), num_vertices(base_graph),
+ to_string(lhs->kind).c_str(), num_vertices(*lhs),
+ to_string(rhs->kind).c_str(), num_vertices(*rhs));
+
+ /* only the first edge's target is consulted to decide whether the edges
+ * being split lead to an accept */
+ bool suffix = vg[target(ee.front(), vg)].type == RIV_ACCEPT;
+
+ if (is_triggered(base_graph)) {
+ /* if we are already guarded, check if the split reduces the size of
+ * the problem before continuing with the split */
+ if (num_vertices(*lhs) >= num_vertices(base_graph)
+ && !(suffix && isVacuous(*rhs))) {
+ DEBUG_PRINTF("split's lhs is no smaller\n");
+ return false;
+ }
+
+ if (num_vertices(*rhs) >= num_vertices(base_graph)) {
+ DEBUG_PRINTF("split's rhs is no smaller\n");
+ return false;
+ }
+ }
+
+ bool do_accept = false;
+ bool do_accept_eod = false;
+ assert(rhs);
+ if (isVacuous(*rhs) && suffix) {
+ /* direct start->accept(Eod) edges in rhs are removed here and replaced
+ * further down by explicit accept vertices hanging off each literal */
+ if (edge(rhs->start, rhs->accept, *rhs).second) {
+ DEBUG_PRINTF("rhs has a cliche\n");
+ do_accept = true;
+ remove_edge(rhs->start, rhs->accept, *rhs);
+ }
+
+ if (edge(rhs->start, rhs->acceptEod, *rhs).second) {
+ DEBUG_PRINTF("rhs has an eod cliche\n");
+ do_accept_eod = true;
+ remove_edge(rhs->start, rhs->acceptEod, *rhs);
+ }
+
+ renumber_edges(*rhs);
+ }
+
+ /* check if we still have a useful graph left over */
+ bool do_norm = out_degree(rhs->start, *rhs) != 1;
+
+ /* reports of all splitter vertices, used for the accept vertices below */
+ set<ReportID> splitter_reports;
+ for (auto v : splitters) {
+ insert(&splitter_reports, base_graph[v].reports);
+ }
+
+ /* find the targets of each source vertex */
+ map<RoseInVertex, flat_set<RoseInVertex> > images;
+ for (const RoseInEdge &e : ee) {
+ RoseInVertex src = source(e, vg);
+ RoseInVertex dest = target(e, vg);
+ images[src].insert(dest);
+ remove_edge(e, vg);
+ }
+
+ /* literal vertices already created, keyed by the target set they serve */
+ map<flat_set<RoseInVertex>, vector<RoseInVertex> > verts_by_image;
+
+ for (const auto &elem : images) {
+ RoseInVertex u = elem.first;
+ const auto &image = elem.second;
+
+ if (contains(verts_by_image, image)) {
+ /* an earlier source had the same target set: reuse its literal
+ * vertices rather than creating duplicates */
+ for (RoseInVertex v : verts_by_image[image]) {
+ add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg);
+ }
+ continue;
+ }
+
+ for (const auto &lit : split.lit) {
+ assert(!bad_mixed_sensitivity(lit));
+
+ /* don't allow overhang in can_match() as literals should
+ * correspond to the edge graph being split; overhanging the graph
+ * would indicate a false path.*/
+ if (!can_match(*lhs, lit, false)) {
+ DEBUG_PRINTF("'%s' did not match lhs\n",
+ escapeString(lit).c_str());
+ continue;
+ }
+
+ DEBUG_PRINTF("best is '%s'\n", escapeString(lit).c_str());
+ auto v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg);
+ add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg);
+
+ /* work out delay later */
+ if (do_accept) {
+ DEBUG_PRINTF("rhs has a cliche\n");
+ auto tt = add_vertex(RoseInVertexProps::makeAccept(
+ splitter_reports), vg);
+ add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
+ }
+
+ if (do_accept_eod) {
+ DEBUG_PRINTF("rhs has an eod cliche\n");
+ auto tt = add_vertex(RoseInVertexProps::makeAcceptEod(
+ splitter_reports), vg);
+ add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
+ }
+
+ if (do_norm) {
+ assert(out_degree(rhs->start, *rhs) > 1);
+ for (RoseInVertex dest : image) {
+ add_edge(v, dest, RoseInEdgeProps(rhs, 0U), vg);
+ }
+ }
+ verts_by_image[image].push_back(v);
+ }
+ }
+
+ assert(hasCorrectlyNumberedVertices(*rhs));
+ assert(hasCorrectlyNumberedEdges(*rhs));
+ assert(hasCorrectlyNumberedVertices(*lhs));
+ assert(hasCorrectlyNumberedEdges(*lhs));
+
+ return true;
+}
+
+#define MAX_NETFLOW_CUT_WIDTH 40 /* magic number is magic */
+#define MAX_LEN_2_LITERALS_PER_CUT 3
+
+static
+bool checkValidNetflowLits(NGHolder &h, const vector<u64a> &scores,
+                           const map<NFAEdge, set<ue2_literal>> &cut_lits,
+                           u32 min_allowed_length) {
+    /* Accepts a cut only if it is no wider than MAX_NETFLOW_CUT_WIDTH, uses
+     * no edge scored at or above NO_LITERAL_AT_EDGE_SCORE, has no literal
+     * shorter than min_allowed_length and has at most
+     * MAX_LEN_2_LITERALS_PER_CUT literals of length 2 in total. */
+    DEBUG_PRINTF("cut width %zu; min allowed %u\n", cut_lits.size(),
+                 min_allowed_length);
+    if (cut_lits.size() > MAX_NETFLOW_CUT_WIDTH) {
+        return false;
+    }
+
+    u32 len_2_count = 0;
+
+    for (const auto &cut : cut_lits) {
+        const NFAEdge &cut_edge = cut.first;
+        const set<ue2_literal> &lits = cut.second;
+
+        if (scores[h[cut_edge].index] >= NO_LITERAL_AT_EDGE_SCORE) {
+            DEBUG_PRINTF("cut uses a forbidden edge\n");
+            return false;
+        }
+
+        if (min_len(lits) < min_allowed_length) {
+            DEBUG_PRINTF("cut uses a bad literal\n");
+            return false;
+        }
+
+        len_2_count += (u32)count_if(lits.begin(), lits.end(),
+                                     [](const ue2_literal &lit) {
+                                         return lit.length() == 2;
+                                     });
+    }
+
+    /* the length 2 budget is shared by the whole cut, so it can only be
+     * judged once every edge has been counted */
+    return len_2_count <= MAX_LEN_2_LITERALS_PER_CUT;
+}
+
+/**
+ * \brief Replace the rose edges in \p to_cut (all of which carry the graph
+ * \p h) with literal vertices derived from the NFA edges in \p cut.
+ *
+ * For each distinct source vertex of \p to_cut and each cut edge, the part of
+ * \p h up to the cut edge's target (the pivot) is split off into a fresh LHS
+ * graph, and one literal vertex is created for every literal in \p cut_lits
+ * that this LHS is able to match. The literal vertices are then connected to
+ * the original targets via RHS graphs built from the successors of the pivot
+ * (shared between pivots with the same successor list). Finally the original
+ * edges are removed from \p vg.
+ *
+ * \param h graph present on every edge in \p to_cut.
+ * \param vg rose input graph, modified in place.
+ * \param to_cut edges of \p vg to be replaced.
+ * \param cut NFA edges of \p h making up the cut.
+ * \param cut_lits literals for each cut edge; must contain an entry for every
+ *        edge in \p cut (it is accessed with at()).
+ */
+static
+void splitEdgesByCut(NGHolder &h, RoseInGraph &vg,
+                     const vector<RoseInEdge> &to_cut,
+                     const vector<NFAEdge> &cut,
+                     const map<NFAEdge, set<ue2_literal> > &cut_lits) {
+    set<RoseInVertex> sources;
+    for (const RoseInEdge &ve : to_cut) {
+        assert(&h == &*vg[ve].graph);
+        sources.insert(source(ve, vg));
+    }
+
+    DEBUG_PRINTF("splitting %s:\n", to_string(h.kind).c_str());
+
+    /* create literal vertices and connect preds */
+    map<RoseInVertex, vector<pair<RoseInVertex, NFAVertex> > > verts_by_source;
+    for (RoseInVertex src : sources) {
+        /* iterate over cut for determinism */
+        for (const auto &e : cut) {
+            NFAVertex prev_v = source(e, h);
+            NFAVertex pivot = target(e, h);
+
+            DEBUG_PRINTF("splitting on pivot %u\n", h[pivot].index);
+            ue2::unordered_map<NFAVertex, NFAVertex> temp_map;
+            shared_ptr<NGHolder> new_lhs = make_shared<NGHolder>();
+            splitLHS(h, pivot, new_lhs.get(), &temp_map);
+
+            /* want to cut off paths to the pivot from anything other than the
+             * source of the cut edge - makes for a more svelte graph */
+            clear_in_edges(temp_map[pivot], *new_lhs);
+            add_edge(temp_map[prev_v], temp_map[pivot], *new_lhs);
+
+            pruneUseless(*new_lhs, false);
+            renumber_vertices(*new_lhs);
+            renumber_edges(*new_lhs);
+
+            DEBUG_PRINTF(" into lhs %s\n", to_string(new_lhs->kind).c_str());
+
+            assert(hasCorrectlyNumberedVertices(*new_lhs));
+            assert(hasCorrectlyNumberedEdges(*new_lhs));
+
+            const set<ue2_literal> &lits = cut_lits.at(e);
+            for (const auto &lit : lits) {
+                if (!can_match(*new_lhs, lit, is_triggered(h))) {
+                    continue;
+                }
+
+                RoseInVertex v
+                    = add_vertex(RoseInVertexProps::makeLiteral(lit), vg);
+
+                /* if this is a prefix/infix an edge directly to accept should
+                 * represent a false path as we have poisoned vertices covered
+                 * by the literals. */
+                if (generates_callbacks(h)) {
+                    if (edge(pivot, h.accept, h).second) {
+                        DEBUG_PRINTF("adding accept\n");
+                        /* literal has a direct connection to accept */
+                        const flat_set<ReportID> &reports = h[pivot].reports;
+                        auto tt = add_vertex(
+                            RoseInVertexProps::makeAccept(reports), vg);
+                        add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
+                    }
+
+                    if (edge(pivot, h.acceptEod, h).second) {
+                        DEBUG_PRINTF("adding acceptEod\n");
+                        /* literal has a direct connection to acceptEod */
+                        const flat_set<ReportID> &reports = h[pivot].reports;
+                        auto tt = add_vertex(
+                            RoseInVertexProps::makeAcceptEod(reports), vg);
+                        add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
+                    }
+                }
+
+                add_edge(src, v, RoseInEdgeProps(new_lhs, 0), vg);
+                verts_by_source[src].push_back({v, pivot});
+            }
+        }
+    }
+
+    /* wire the literal vertices up to successors */
+    map<vector<NFAVertex>, shared_ptr<NGHolder> > done_rhs;
+    for (const RoseInEdge &ve : to_cut) {
+        RoseInVertex src = source(ve, vg);
+        RoseInVertex dest = target(ve, vg);
+
+        /* verts_by_source was filled in cut order, so this is deterministic */
+        for (const auto &elem : verts_by_source[src]) {
+            NFAVertex pivot = elem.second;
+            RoseInVertex v = elem.first;
+
+            vector<NFAVertex> adj;
+            insert(&adj, adj.end(), adjacent_vertices(pivot, h));
+            /* we can ignore presence of accept, accepteod in adj as it is best
+               effort */
+
+            /* build each distinct RHS once and share it afterwards */
+            auto rhs_it = done_rhs.find(adj);
+            if (rhs_it == done_rhs.end()) {
+                ue2::unordered_map<NFAVertex, NFAVertex> temp_map;
+                shared_ptr<NGHolder> fresh_rhs = make_shared<NGHolder>();
+                splitRHS(h, adj, fresh_rhs.get(), &temp_map);
+                remove_edge(fresh_rhs->start, fresh_rhs->accept, *fresh_rhs);
+                remove_edge(fresh_rhs->start, fresh_rhs->acceptEod,
+                            *fresh_rhs);
+                renumber_edges(*fresh_rhs);
+                DEBUG_PRINTF(" into rhs %s\n",
+                             to_string(fresh_rhs->kind).c_str());
+                rhs_it = done_rhs.emplace(adj, fresh_rhs).first;
+            }
+
+            assert(rhs_it->second.get());
+            shared_ptr<NGHolder> new_rhs = rhs_it->second;
+
+            assert(hasCorrectlyNumberedVertices(*new_rhs));
+            assert(hasCorrectlyNumberedEdges(*new_rhs));
+
+            if (vg[dest].type == RIV_LITERAL
+                && !can_match(*new_rhs, vg[dest].s, true)) {
+                continue;
+            }
+
+            if (out_degree(new_rhs->start, *new_rhs) != 1) {
+                add_edge(v, dest, RoseInEdgeProps(new_rhs, 0), vg);
+            }
+        }
+
+        remove_edge(ve, vg);
+    }
+}
+
+/**
+ * \brief Try to split \p h (the graph on the edges \p ee of \p vg) using a
+ * minimum cut over scored edges.
+ *
+ * Edges are poisoned, scored, and a min cut is found; the literal set for
+ * each cut edge is then computed and validated before the rose edges are
+ * split with splitEdgesByCut().
+ *
+ * \param h graph to cut; must be correctly numbered (asserted).
+ * \param depths vertex depths for \p h; must be non-null if \p for_prefix.
+ * \param vg rose input graph, modified on success.
+ * \param ee non-empty set of edges in \p vg which carry \p h.
+ * \param for_prefix passed through to poisonEdges().
+ * \param grey provides maxRoseNetflowEdges and minRoseNetflowLiteralLength.
+ * \param min_allowed_length minimum literal length; raised to at least
+ *        grey.minRoseNetflowLiteralLength.
+ * \return true if the edges were split, false if the graph is too big or the
+ *         cut literals were rejected.
+ */
+static
+bool doNetflowCut(NGHolder &h,
+                  const vector<NFAVertexDepth> *depths,
+                  RoseInGraph &vg,
+                  const vector<RoseInEdge> &ee, bool for_prefix,
+                  const Grey &grey, u32 min_allowed_length = 0U) {
+    ENSURE_AT_LEAST(&min_allowed_length, grey.minRoseNetflowLiteralLength);
+
+    DEBUG_PRINTF("doing netflow cut\n");
+    /* TODO: we should really get literals/scores from the full graph as this
+     * allows us to overlap with previous cuts. */
+    assert(!ee.empty());
+    assert(&h == &*vg[ee.front()].graph);
+    assert(!for_prefix || depths);
+
+    if (num_edges(h) > grey.maxRoseNetflowEdges) {
+        /* We have a limit on this because scoring edges and running netflow
+         * gets very slow for big graphs. */
+        DEBUG_PRINTF("too many edges, skipping netflow cut\n");
+        return false;
+    }
+
+    assert(hasCorrectlyNumberedVertices(h));
+    assert(hasCorrectlyNumberedEdges(h));
+
+    auto known_bad = poisonEdges(h, depths, vg, ee, for_prefix, grey);
+
+    /* Step 1: Get scores for all edges */
+    vector<u64a> scores = scoreEdges(h, known_bad); /* scores by edge_index */
+
+    /* Step 2: Find cutset based on scores */
+    vector<NFAEdge> cut = findMinCut(h, scores);
+
+    /* Step 3: Get literals corresponding to cut edges */
+    map<NFAEdge, set<ue2_literal>> cut_lits;
+    for (const auto &e : cut) {
+        set<ue2_literal> lits = getLiteralSet(h, e);
+        sanitizeAndCompressAndScore(lits);
+
+        /* Nothing at this point guarantees that the literal set is non-empty
+         * (validation only happens after this loop), so only dereference
+         * begin() when there is a literal to print. */
+        if (!lits.empty()) {
+            DEBUG_PRINTF("cut lit '%s' %u->%u\n",
+                         ((const string &)*lits.begin()).c_str(),
+                         h[source(e, h)].index, h[target(e, h)].index);
+        }
+
+        /* the local set is not needed again: hand it over without a copy */
+        cut_lits[e] = std::move(lits);
+    }
+
+    /* if literals are underlength bail or if it involves a forbidden edge*/
+    if (!checkValidNetflowLits(h, scores, cut_lits, min_allowed_length)) {
+        return false;
+    }
+    DEBUG_PRINTF("splitting\n");
+
+    /* Step 4: Split graph based on cuts */
+    splitEdgesByCut(h, vg, ee, cut, cut_lits);
+
+    return true;
+}
+
+/**
+ * \brief Rewrite a leading "^.*" or "^.+" hanging off start so that the graph
+ * is driven from startDs instead.
+ *
+ * Does nothing if startDs already has (proper) successors.
+ *
+ * \return true if the graph was modified.
+ */
+static
+bool deanchorIfNeeded(NGHolder &g) {
+    DEBUG_PRINTF("hi\n");
+    if (proper_out_degree(g.startDs, g)) {
+        return false;
+    }
+
+    /* successors of start, ignoring startDs */
+    set<NFAVertex> start_succs;
+    insert(&start_succs, adjacent_vertices(g.start, g));
+    start_succs.erase(g.startDs);
+
+    /* look for a non-special dot with a loop following start */
+    for (auto cand : adjacent_vertices_range(g.start, g)) {
+        DEBUG_PRINTF("inspecting cand %u || = %zu\n", g[cand].index,
+                     g[cand].char_reach.count());
+
+        const bool is_dot = g[cand].char_reach.all();
+        if (cand == g.startDs || !is_dot) {
+            continue;
+        }
+
+        set<NFAVertex> cand_succs;
+        insert(&cand_succs, adjacent_vertices(cand, g));
+
+        if (cand_succs == start_succs) {
+            /* the dot vertex is optional and loops: move its work to
+             * startDs and drop it */
+            DEBUG_PRINTF("found ^.*\n");
+            for (auto s : start_succs) {
+                add_edge(g.startDs, s, g);
+            }
+            clear_vertex(cand, g);
+            remove_vertex(cand, g);
+            renumber_vertices(g);
+            return true;
+        }
+
+        if (start_succs.size() == 1 && hasSelfLoop(cand, g)) {
+            /* mandatory dot with a loop: let startDs provide the repeat */
+            DEBUG_PRINTF("found ^.+\n");
+            add_edge(g.startDs, cand, g);
+            remove_edge(cand, cand, g);
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/**
+ * \brief Build the initial rose input graph for \p h: a start vertex and an
+ * accept vertex (with an empty report set) joined by a single edge carrying a
+ * clone of \p h.
+ *
+ * The start vertex is created as anchored if the clone is anchored or had to
+ * be de-anchored by deanchorIfNeeded().
+ */
+static
+RoseInGraph populateTrivialGraph(const NGHolder &h) {
+    RoseInGraph g;
+    shared_ptr<NGHolder> root_g = cloneHolder(h);
+
+    /* both calls are always made; deanchorIfNeeded() may modify the clone */
+    const bool was_anchored = isAnchored(*root_g);
+    const bool deanchored = deanchorIfNeeded(*root_g);
+    const bool orig_anch = was_anchored || deanchored;
+
+    DEBUG_PRINTF("orig_anch %d\n", (int)orig_anch);
+
+    auto start = add_vertex(RoseInVertexProps::makeStart(orig_anch), g);
+    auto accept = add_vertex(RoseInVertexProps::makeAccept(set<ReportID>()), g);
+
+    add_edge(start, accept, RoseInEdgeProps(root_g, 0), g);
+
+    return g;
+}
+
+/**
+ * \brief If \p vg is still the trivial two-vertex graph, try to split its one
+ * edge, first on a simple literal and otherwise with a netflow cut.
+ */
+static
+void avoidOutfixes(RoseInGraph &vg, const CompileContext &cc) {
+    STAGE_DEBUG_PRINTF("AVOIDING OUTFIX\n");
+    if (num_vertices(vg) > 2) {
+        /* must be at least one literal aside from start and accept */
+        return;
+    }
+
+    RoseInEdge e = *edges(vg).first;
+    NGHolder &h = *vg[e].graph;
+
+    renumber_vertices(h);
+    renumber_edges(h);
+
+    unique_ptr<VertLitInfo> split = findBestNormalSplit(h, vg, {e}, cc);
+
+    if (split && splitRoseEdge(h, vg, {e}, *split)) {
+        DEBUG_PRINTF("split on simple literal\n");
+        return;
+    }
+
+    /* no usable simple split: fall back to a netflow cut */
+    doNetflowCut(h, nullptr, vg, {e}, false, cc.grey);
+}
+
+/**
+ * \brief Drop the graph from start -> literal edges where the graph is
+ * floating and consists of nothing but the literal itself.
+ */
+static
+void removeRedundantPrefixes(RoseInGraph &g) {
+    STAGE_DEBUG_PRINTF("REMOVING REDUNDANT PREFIXES\n");
+
+    for (const RoseInEdge &e : edges_range(g)) {
+        const RoseInVertex from = source(e, g);
+        const RoseInVertex to = target(e, g);
+
+        /* only (floating) start -> literal edges which carry a graph */
+        if (g[from].type != RIV_START || g[to].type != RIV_LITERAL
+            || !g[e].graph) {
+            continue;
+        }
+
+        assert(!g[to].delay);
+        const ue2_literal &lit = g[to].s;
+
+        if (!literalIsWholeGraph(*g[e].graph, lit)) {
+            DEBUG_PRINTF("not whole graph\n");
+            continue;
+        }
+
+        if (!isFloating(*g[e].graph)) {
+            DEBUG_PRINTF("not floating\n");
+            continue;
+        }
+
+        g[e].graph.reset();
+    }
+}
+
+/**
+ * \brief Delay limit for the compile mode: grey.maxHistoryAvailable when
+ * streaming, MO_INVALID_IDX otherwise.
+ */
+static
+u32 maxDelay(const CompileContext &cc) {
+    return cc.streaming ? cc.grey.maxHistoryAvailable : MO_INVALID_IDX;
+}
+
+/**
+ * \brief For each (anchored or floating) start -> literal edge carrying a
+ * graph, strip the states of the graph which just match the tail of the
+ * literal and record the number of characters removed as lag on the edge.
+ *
+ * Edges whose graph turns out to be pointless lose it entirely; edges whose
+ * graph reduces to a bare start -> accept are rehomed onto a newly created
+ * anchored start vertex.
+ */
+static
+void removeRedundantLiteralsFromPrefixes(RoseInGraph &g,
+                                         const CompileContext &cc) {
+    STAGE_DEBUG_PRINTF("REMOVING LITERALS FROM PREFIXES\n");
+
+    /* the limit depends only on the compile context */
+    const u32 max_delay = maxDelay(cc);
+
+    vector<RoseInEdge> to_anchor;
+    for (const RoseInEdge &e : edges_range(g)) {
+        const RoseInVertex u = source(e, g);
+        const RoseInVertex v = target(e, g);
+
+        const bool from_start = g[u].type == RIV_START
+                                || g[u].type == RIV_ANCHORED_START;
+        if (!from_start || g[v].type != RIV_LITERAL || !g[e].graph) {
+            continue;
+        }
+
+        assert(!g[v].delay);
+        const ue2_literal &lit = g[v].s;
+
+        DEBUG_PRINTF("removing states for literal: %s\n",
+                     dumpString(lit).c_str());
+
+        /* work on a copy; it only replaces the original if we gain a delay */
+        unique_ptr<NGHolder> trimmed = cloneHolder(*g[e].graph);
+
+        u32 delay = removeTrailingLiteralStates(*trimmed, lit, max_delay,
+                                                false /* can't overhang start */);
+
+        DEBUG_PRINTF("got delay %u (max allowed %u)\n", delay, max_delay);
+
+        if (edge(trimmed->startDs, trimmed->accept, *trimmed).second) {
+            /* we should have delay == lit.length(), but in really complex
+             * cases we may fail to identify that we can remove the whole
+             * graph. Regardless, the fact that sds is wired to accept means the
+             * graph serves no purpose. */
+            DEBUG_PRINTF("whole graph\n");
+            g[e].graph.reset();
+            continue;
+        }
+
+        const bool only_specials = num_vertices(*trimmed) == N_SPECIALS;
+        if (delay == lit.length()
+            && edge(trimmed->start, trimmed->accept, *trimmed).second
+            && only_specials) {
+            to_anchor.push_back(e);
+            continue;
+        }
+
+        /* if we got here we should still have an interesting graph */
+        assert(delay == max_delay || num_vertices(*trimmed) > N_SPECIALS);
+
+        if (delay && delay != MO_INVALID_IDX) {
+            DEBUG_PRINTF("setting delay %u on lhs %p\n", delay, trimmed.get());
+
+            g[e].graph = move(trimmed);
+            g[e].graph_lag = delay;
+        }
+    }
+
+    if (to_anchor.empty()) {
+        return;
+    }
+
+    /* literals which need nothing more than an anchored start in front */
+    RoseInVertex anch = add_vertex(RoseInVertexProps::makeStart(true), g);
+
+    for (RoseInEdge e : to_anchor) {
+        DEBUG_PRINTF("rehoming to anchor\n");
+        RoseInVertex v = target(e, g);
+        add_edge(anch, v, g);
+        remove_edge(e, g);
+    }
+}
+
+/**
+ * \brief True if \p g is just a dot-star: exactly one non-special vertex,
+ * which has full reach, a self loop and an edge to accept, together with a
+ * direct start -> accept edge.
+ */
+static
+bool isStarCliche(const NGHolder &g) {
+    DEBUG_PRINTF("checking graph with %zu vertices\n", num_vertices(g));
+
+    /* find the one and only non-special vertex */
+    bool found = false;
+    NFAVertex dot = NGHolder::null_vertex();
+    for (auto v : vertices_range(g)) {
+        if (is_special(v, g)) {
+            continue;
+        }
+        if (found) {
+            return false; /* more than one non-special */
+        }
+        found = true;
+        dot = v;
+    }
+
+    if (!found) {
+        return false;
+    }
+
+    return g[dot].char_reach.all()
+        && hasSelfLoop(dot, g)
+        && edge(dot, g.accept, g).second
+        && edge(g.start, g.accept, g).second;
+}
+
+/**
+ * \brief Trim from the infix \p h the trailing states which only match the
+ * successor literal, for each of the edges \p ee of \p ig carrying \p h.
+ *
+ * One trimmed copy of \p h is made per distinct successor literal; the number
+ * of characters removed becomes the lag on the corresponding edges. The
+ * amount trimmed is limited by the longest overlap between any predecessor
+ * literal and the successor literal and, in streaming mode, by
+ * grey.maxHistoryAvailable. An edge whose trimmed graph is a plain dot-star
+ * loses its graph altogether.
+ */
+static
+void removeRedundantLiteralsFromInfix(const NGHolder &h, RoseInGraph &ig,
+                                      const vector<RoseInEdge> &ee,
+                                      const CompileContext &cc) {
+    /* TODO: This could be better by not creating a separate graph for each
+     * successor literal. This would require using distinct report ids and also
+     * taking into account overlap of successor literals. */
+
+    /* collect the literals on either side of the infix */
+    set<ue2_literal> preds;
+    set<ue2_literal> succs;
+    for (const RoseInEdge &e : ee) {
+        const RoseInVertex u = source(e, ig);
+        const RoseInVertex v = target(e, ig);
+
+        assert(ig[u].type == RIV_LITERAL);
+        assert(!ig[e].graph_lag);
+        assert(!ig[u].delay);
+        assert(ig[v].type == RIV_LITERAL);
+        assert(!ig[v].delay);
+
+        preds.insert(ig[u].s);
+        succs.insert(ig[v].s);
+    }
+
+    /* successor literal -> (trimmed graph, delay) */
+    map<ue2_literal, pair<shared_ptr<NGHolder>, u32> > graphs;
+
+    for (const ue2_literal &right : succs) {
+        size_t max_overlap = 0;
+        for (const ue2_literal &left : preds) {
+            size_t overlap = maxOverlap(left, right, 0);
+            ENSURE_AT_LEAST(&max_overlap, overlap);
+        }
+
+        u32 max_allowed_delay = right.length() - max_overlap;
+
+        if (cc.streaming) {
+            LIMIT_TO_AT_MOST(&max_allowed_delay, cc.grey.maxHistoryAvailable);
+        }
+
+        if (!max_allowed_delay) {
+            continue;
+        }
+
+        shared_ptr<NGHolder> h_new = cloneHolder(h);
+
+        u32 delay = removeTrailingLiteralStates(*h_new, right,
+                                                max_allowed_delay);
+
+        if (delay == MO_INVALID_IDX) {
+            /* successor literal could not match infix -> ignore false path */
+            assert(0);
+            continue;
+        }
+
+        graphs.emplace(right, make_pair(h_new, delay));
+    }
+
+    /* install the trimmed graphs on the edges */
+    for (const RoseInEdge &e : ee) {
+        const auto found = graphs.find(ig[target(e, ig)].s);
+        if (found == graphs.end()) {
+            continue;
+        }
+
+        ig[e].graph = found->second.first;
+        ig[e].graph_lag = found->second.second;
+
+        if (isStarCliche(*ig[e].graph)) {
+            DEBUG_PRINTF("is a X star!\n");
+            ig[e].graph.reset();
+            ig[e].graph_lag = 0;
+        }
+    }
+}
+
+/**
+ * \brief Run removeRedundantLiteralsFromInfix() over every graph which sits
+ * on a literal -> literal edge of \p g.
+ *
+ * Edges are grouped by the graph they carry. The grouping map is keyed on
+ * pointer value, so the graphs are processed in the order they were first
+ * seen while walking the edges rather than in address order; this keeps the
+ * processing order independent of where the graphs happen to be allocated,
+ * in line with the other passes in this file which record a seen order.
+ */
+static
+void removeRedundantLiteralsFromInfixes(RoseInGraph &g,
+                                        const CompileContext &cc) {
+    vector<NGHolder *> seen_order;
+    map<NGHolder *, vector<RoseInEdge> > infixes;
+
+    for (const RoseInEdge &e : edges_range(g)) {
+        RoseInVertex s = source(e, g);
+        RoseInVertex t = target(e, g);
+
+        if (g[s].type != RIV_LITERAL || g[t].type != RIV_LITERAL) {
+            continue;
+        }
+
+        if (!g[e].graph) {
+            continue;
+        }
+
+        assert(!g[t].delay);
+
+        NGHolder *h = g[e].graph.get();
+        if (!contains(infixes, h)) {
+            seen_order.push_back(h);
+        }
+        infixes[h].push_back(e);
+    }
+
+    for (NGHolder *h : seen_order) {
+        removeRedundantLiteralsFromInfix(*h, g, infixes[h], cc);
+    }
+}
+
+
+/**
+ * \brief Trim literal-matching states from the graphs in \p g: prefixes are
+ * handled first, then infixes.
+ */
+static
+void removeRedundantLiterals(RoseInGraph &g, const CompileContext &cc) {
+    /* graphs leading from a start vertex to a literal */
+    removeRedundantLiteralsFromPrefixes(g, cc);
+
+    /* graphs between two literals */
+    removeRedundantLiteralsFromInfixes(g, cc);
+}
+
+/**
+ * \brief Returns the first vertex of \p vg (in vertex iteration order) which
+ * is a start or an anchored start. Asserts and returns the null vertex if
+ * there is none.
+ */
+static
+RoseInVertex getStart(RoseInGraph &vg) {
+    for (RoseInVertex v : vertices_range(vg)) {
+        switch (vg[v].type) {
+        case RIV_START:
+        case RIV_ANCHORED_START:
+            return v;
+        default:
+            break;
+        }
+    }
+
+    assert(0);
+    return RoseInGraph::null_vertex();
+}
+
+/**
+ * \brief Finds the initial accept vertex, to which suffixes/outfixes are
+ * attached: the first accept vertex of \p vg with no reports. Asserts and
+ * returns the null vertex if there is none.
+ */
+static
+RoseInVertex getPrimaryAccept(RoseInGraph &vg) {
+    for (RoseInVertex v : vertices_range(vg)) {
+        if (vg[v].type != RIV_ACCEPT) {
+            continue;
+        }
+        if (vg[v].reports.empty()) {
+            return v;
+        }
+    }
+
+    assert(0);
+    return RoseInGraph::null_vertex();
+}
+
+/**
+ * \brief True if \p max_depth is within the transient limit for the compile
+ * mode: grey.maxHistoryAvailable + 1 when streaming,
+ * ROSE_BLOCK_TRANSIENT_MAX_WIDTH otherwise.
+ */
+static
+bool willBeTransient(const depth &max_depth, const CompileContext &cc) {
+    const depth limit = cc.streaming
+                            ? depth(cc.grey.maxHistoryAvailable + 1)
+                            : depth(ROSE_BLOCK_TRANSIENT_MAX_WIDTH);
+    return max_depth <= limit;
+}
+
+/**
+ * \brief True if \p max_depth does not exceed grey.maxAnchoredRegion.
+ */
+static
+bool willBeAnchoredTable(const depth &max_depth, const Grey &grey) {
+    const depth limit(grey.maxAnchoredRegion);
+    return max_depth <= limit;
+}
+
+/**
+ * \brief Builds an infix graph made of a chain of \p count dot vertices
+ * running from start to accept; the last vertex carries report 0.
+ *
+ * \param count number of dot vertices; must be non-zero (asserted).
+ */
+static
+unique_ptr<NGHolder> make_chain(u32 count) {
+    assert(count);
+
+    auto chain = make_unique<NGHolder>(NFA_INFIX);
+
+    NFAVertex tail = chain->start;
+    for (u32 remaining = count; remaining > 0; remaining--) {
+        NFAVertex next = add_vertex(*chain);
+        (*chain)[next].char_reach = CharReach::dot();
+        add_edge(tail, next, *chain);
+        tail = next;
+    }
+
+    (*chain)[tail].reports.insert(0);
+    add_edge(tail, chain->accept, *chain);
+
+    return chain;
+}
+
+#define SHORT_TRIGGER_LEN 16
+
+/**
+ * \brief Try to shrink the prefix \p h by peeling the tail off the long
+ * literals which follow it.
+ *
+ * With delta = (shortest successor literal length) - SHORT_TRIGGER_LEN, each
+ * successor literal is split into a head and a delta-long tail. The head
+ * becomes a new literal vertex fed by a copy of \p h with delta trailing
+ * literal states removed, and is joined to the original vertex (which keeps
+ * only the tail) by a chain of delta dots.
+ *
+ * Nothing is changed unless every edge in \p ee can be handled.
+ *
+ * \return true if \p vg was rewritten.
+ */
+static
+bool makeTransientFromLongLiteral(NGHolder &h, RoseInGraph &vg,
+                                  const vector<RoseInEdge> &ee,
+                                  const CompileContext &cc) {
+    /* check max width and literal lengths to see if possible */
+    size_t shortest = ~0ULL;
+    for (const RoseInEdge &e : ee) {
+        LIMIT_TO_AT_MOST(&shortest, vg[target(e, vg)].s.length());
+    }
+
+    if (shortest <= SHORT_TRIGGER_LEN || shortest >= UINT_MAX) {
+        return false;
+    }
+
+    depth max_width = findMaxWidth(h);
+
+    u32 delta = shortest - SHORT_TRIGGER_LEN;
+
+    const bool helps = willBeTransient(max_width - depth(delta), cc)
+                       || willBeAnchoredTable(max_width - depth(delta),
+                                              cc.grey);
+    if (!helps) {
+        return false;
+    }
+
+    DEBUG_PRINTF("candidate for splitting long literal (len %zu)\n", shortest);
+    DEBUG_PRINTF("delta = %u\n", delta);
+
+    /* try split: build every trimmed graph before touching vg */
+    map<RoseInVertex, shared_ptr<NGHolder> > graphs;
+    for (const RoseInEdge &e : ee) {
+        const RoseInVertex lit_v = target(e, vg);
+
+        shared_ptr<NGHolder> h_new = cloneHolder(h);
+
+        u32 delay = removeTrailingLiteralStates(*h_new, vg[lit_v].s, delta);
+
+        DEBUG_PRINTF("delay %u\n", delay);
+
+        if (delay != delta) {
+            DEBUG_PRINTF("unable to trim literal\n");
+            return false;
+        }
+
+        if (in_degree(lit_v, vg) != 1) {
+            DEBUG_PRINTF("complicated\n");
+            return false;
+        }
+
+        DEBUG_PRINTF("new mw = %u\n", (u32)findMaxWidth(*h_new));
+        assert(willBeTransient(findMaxWidth(*h_new), cc)
+               || willBeAnchoredTable(findMaxWidth(*h_new), cc.grey));
+
+        graphs[lit_v] = h_new;
+    }
+
+    /* add .{repeats} from prefixes to long literals */
+    for (const RoseInEdge &e : ee) {
+        const RoseInVertex from = source(e, vg);
+        const RoseInVertex to = target(e, vg);
+
+        remove_edge(e, vg);
+
+        /* build both halves before the original literal is overwritten */
+        const ue2_literal &whole = vg[to].s;
+        ue2_literal head(whole.begin(), whole.end() - delta);
+        ue2_literal tail(whole.end() - delta, whole.end());
+
+        assert(head.length() + delta == whole.length());
+
+        vg[to].s = tail;
+
+        RoseInVertex head_v
+            = add_vertex(RoseInVertexProps::makeLiteral(head), vg);
+        add_edge(from, head_v, RoseInEdgeProps(graphs[to], 0), vg);
+        add_edge(head_v, to, RoseInEdgeProps(make_chain(delta), 0), vg);
+    }
+
+    DEBUG_PRINTF("success\n");
+    /* TODO: alter split point to avoid pathological splits */
+    return true;
+}
+
+/**
+ * \brief Try to replace the prefix \p h, which sits on the edges \p ee of
+ * \p vg, with something cheaper.
+ *
+ * The strategies are tried in order and the first to succeed wins:
+ *  -# peeling the tail off long successor literals;
+ *  -# a simple literal split which creates a transient or anchored prefix;
+ *  -# a netflow cut aimed at prefixes;
+ *  -# the simple literal split, even though it creates neither;
+ *  -# a netflow cut with no regard for prefix quality;
+ *  -# if there are several edges, giving each group of edges its own copy of
+ *     the graph specialised for its successor literal.
+ *
+ * \return true if \p vg was modified.
+ */
+static
+bool improvePrefix(NGHolder &h, RoseInGraph &vg, const vector<RoseInEdge> &ee,
+                   const CompileContext &cc) {
+    DEBUG_PRINTF("trying to improve prefix %p, %zu verts\n", &h,
+                 num_vertices(h));
+
+    renumber_vertices(h);
+    renumber_edges(h);
+
+    vector<NFAVertexDepth> depths;
+    calcDepths(h, depths);
+
+    /* If the reason the prefix is not transient is due to a very long literal
+     * following, we can make it transient by restricting ourselves to using
+     * just the head of the literal. */
+    if (makeTransientFromLongLiteral(h, vg, ee, cc)) {
+        return true;
+    }
+
+    unique_ptr<VertLitInfo> split = findBestPrefixSplit(h, depths, vg, ee, cc);
+
+    if (split && (split->creates_transient || split->creates_anchored)
+        && splitRoseEdge(h, vg, ee, *split)) {
+        DEBUG_PRINTF("split on simple literal\n");
+        return true;
+    }
+
+    /* large back edges may prevent us identifing anchored or transient cases
+     * properly - use a simple walk instead */
+
+    if (doNetflowCut(h, &depths, vg, ee, true, cc.grey)) {
+        return true;
+    }
+
+    if (split && splitRoseEdge(h, vg, ee, *split)) {
+        /* use the simple split even though it doesn't create a transient
+         * prefix */
+        DEBUG_PRINTF("split on simple literal\n");
+        return true;
+    }
+
+    /* look for netflow cuts which don't produce good prefixes */
+    if (doNetflowCut(h, &depths, vg, ee, false, cc.grey)) {
+        return true;
+    }
+
+    if (ee.size() > 1) {
+        DEBUG_PRINTF("split the prefix apart based on succ literals\n");
+        unordered_map<shared_ptr<NGHolder>, vector<pair<RoseInEdge, u32> >,
+                      NGHolderHasher, NGHolderEqual> trimmed;
+
+        for (const auto &e : ee) {
+            shared_ptr<NGHolder> hh = cloneHolder(h);
+            auto succ_lit = vg[target(e, vg)].s;
+            u32 delay = removeTrailingLiteralStates(*hh, succ_lit,
+                                                    succ_lit.length(),
+                                                    false /* can't overhang start */);
+            if (!delay) {
+                DEBUG_PRINTF("could not remove any literal, skip over\n");
+                continue;
+            }
+
+            trimmed[hh].emplace_back(e, delay);
+        }
+
+        /* A single group means the edges cannot be told apart; no group at
+         * all means that nothing could be trimmed. In both cases the graph
+         * is left untouched, so we must not report a change. */
+        if (trimmed.size() <= 1) {
+            return false;
+        }
+
+        /* shift the contents to a vector so we can modify the graphs without
+         * violating the map's invariants. */
+        vector<pair<shared_ptr<NGHolder>, vector<pair<RoseInEdge, u32> > > >
+            trimmed_vec(trimmed.begin(), trimmed.end());
+        trimmed.clear();
+        for (auto &elem : trimmed_vec) {
+            shared_ptr<NGHolder> &hp = elem.first;
+            NGHolder &trimmed_g = *hp;
+
+            vector<NFAVertex> base_states;
+            insert(&base_states, base_states.end(),
+                   inv_adjacent_vertices(trimmed_g.accept, trimmed_g));
+            clear_in_edges(trimmed_g.accept, trimmed_g);
+
+            for (auto v : base_states) {
+                /* clear report from old accepts */
+                trimmed_g[v].reports.clear();
+            }
+
+            for (const auto &edge_delay : elem.second) {
+                const RoseInEdge &e = edge_delay.first;
+                u32 delay = edge_delay.second;
+                auto succ_lit = vg[target(e, vg)].s;
+
+                vg[e].graph = hp;
+                assert(delay <= succ_lit.length());
+                restoreTrailingLiteralStates(*vg[e].graph, succ_lit, delay,
+                                             base_states);
+            }
+        }
+        return true;
+    }
+
+    return false;
+}
+
+#define MAX_FIND_BETTER_PREFIX_GEN 4
+#define MAX_FIND_BETTER_PREFIX_COUNT 100
+
+/**
+ * \brief Repeatedly try to improve the prefixes of \p vg (graphs on the out
+ * edges of the start vertex) which will be neither transient nor small
+ * enough for the anchored table.
+ *
+ * Each generation regathers the prefixes, as a successful improvement
+ * rewrites the edges. We stop when a generation changes nothing, when the
+ * generation limit (MAX_FIND_BETTER_PREFIX_GEN) is reached, or when there
+ * are more than MAX_FIND_BETTER_PREFIX_COUNT prefixes to look at.
+ */
+static
+void findBetterPrefixes(RoseInGraph &vg, const CompileContext &cc) {
+    STAGE_DEBUG_PRINTF("FIND BETTER PREFIXES\n");
+    RoseInVertex start = getStart(vg);
+
+    bool changed;
+    u32 gen = 0;
+    do {
+        DEBUG_PRINTF("gen %u\n", gen);
+        changed = false;
+        vector<NGHolder *> seen_order;
+        map<NGHolder *, vector<RoseInEdge> > prefixes;
+
+        /* find prefixes */
+        for (const RoseInEdge &e : out_edges_range(start, vg)) {
+            /* outfixes shouldn't have made it this far */
+            assert(vg[target(e, vg)].type == RIV_LITERAL);
+            if (vg[e].graph) {
+                NGHolder *h = vg[e].graph.get();
+                if (!contains(prefixes, h)) {
+                    seen_order.push_back(h);
+                }
+                prefixes[h].push_back(e);
+            }
+        }
+
+        if (prefixes.size() > MAX_FIND_BETTER_PREFIX_COUNT) {
+            break;
+        }
+
+        /* look for bad prefixes and try to split */
+        for (NGHolder *h : seen_order) {
+            depth max_width = findMaxWidth(*h);
+            if (willBeTransient(max_width, cc)
+                || willBeAnchoredTable(max_width, cc.grey)) {
+                continue;
+            }
+
+            /* accumulate: a failure on a later prefix must not hide the fact
+             * that an earlier one was rewritten in this generation */
+            if (improvePrefix(*h, vg, prefixes[h], cc)) {
+                changed = true;
+            }
+        }
+    } while (changed && gen++ < MAX_FIND_BETTER_PREFIX_GEN);
+}
+
+#define STRONG_LITERAL_LENGTH 20
+#define MAX_EXTRACT_STRONG_LITERAL_GRAPHS 10
+
+/**
+ * \brief Split the edges \p ee carrying \p h on the best normal split, but
+ * only if its shortest literal is at least STRONG_LITERAL_LENGTH long.
+ *
+ * \return true if the split was performed.
+ */
+static
+bool extractStrongLiteral(NGHolder &h, RoseInGraph &vg,
+                          const vector<RoseInEdge> &ee,
+                          const CompileContext &cc) {
+    DEBUG_PRINTF("looking for string literal\n");
+    unique_ptr<VertLitInfo> split = findBestNormalSplit(h, vg, ee, cc);
+
+    if (!split || min_len(split->lit) < STRONG_LITERAL_LENGTH) {
+        return false;
+    }
+
+    DEBUG_PRINTF("splitting simple literal\n");
+    return splitRoseEdge(h, vg, ee, *split);
+}
+
+/**
+ * \brief While progress is being made, pull strong literals out of the
+ * graphs which follow a literal vertex in \p vg.
+ *
+ * Controlled by grey.violetExtractStrongLiterals. Graphs for which the
+ * extraction failed are remembered and not retried; the pass gives up as
+ * soon as there are more than MAX_EXTRACT_STRONG_LITERAL_GRAPHS graphs.
+ */
+static
+void extractStrongLiterals(RoseInGraph &vg, const CompileContext &cc) {
+    if (!cc.grey.violetExtractStrongLiterals) {
+        return;
+    }
+    STAGE_DEBUG_PRINTF("EXTRACT STRONG LITERALS\n");
+
+    set<NGHolder *> stuck; /* graphs we have already failed on */
+
+    bool changed;
+    do {
+        changed = false;
+
+        /* gather the graphs following literals, in first-seen order */
+        vector<NGHolder *> seen_order;
+        map<NGHolder *, vector<RoseInEdge> > edges_by_graph;
+        for (const RoseInEdge &ve : edges_range(vg)) {
+            if (vg[source(ve, vg)].type != RIV_LITERAL) {
+                continue;
+            }
+
+            NGHolder *gp = vg[ve].graph.get();
+            if (!gp) {
+                continue;
+            }
+
+            if (!contains(edges_by_graph, gp)) {
+                seen_order.push_back(gp);
+            }
+            edges_by_graph[gp].push_back(ve);
+        }
+
+        if (edges_by_graph.size() > MAX_EXTRACT_STRONG_LITERAL_GRAPHS) {
+            DEBUG_PRINTF("too many graphs, stopping\n");
+            return;
+        }
+
+        for (NGHolder *g : seen_order) {
+            if (contains(stuck, g)) {
+                DEBUG_PRINTF("already known to be bad\n");
+                continue;
+            }
+
+            if (extractStrongLiteral(*g, vg, edges_by_graph[g], cc)) {
+                changed = true;
+            } else {
+                stuck.insert(g);
+            }
+        }
+    } while (changed);
+}
+
+#define INFIX_STRONG_GUARD_LEN 8
+#define INFIX_MIN_SPLIT_LITERAL_LEN 12
+
+/**
+ * \brief Try to split the infix \p h on the edges \p ee: first on a simple
+ * literal of at least INFIX_MIN_SPLIT_LITERAL_LEN characters, otherwise with
+ * a netflow cut (minimum literal length 8).
+ *
+ * \return true if \p vg was modified.
+ */
+static
+bool improveInfix(NGHolder &h, RoseInGraph &vg, const vector<RoseInEdge> &ee,
+                  const CompileContext &cc) {
+    unique_ptr<VertLitInfo> split = findBestNormalSplit(h, vg, ee, cc);
+
+    const bool long_enough
+        = split && min_len(split->lit) >= INFIX_MIN_SPLIT_LITERAL_LEN;
+    if (long_enough && splitRoseEdge(h, vg, ee, *split)) {
+        DEBUG_PRINTF("splitting simple literal\n");
+        return true;
+    }
+
+    DEBUG_PRINTF("trying for a netflow cut\n");
+    /* look for netflow cuts which don't produce good prefixes */
+    const bool rv = doNetflowCut(h, nullptr, vg, ee, false, cc.grey, 8);
+
+    DEBUG_PRINTF("did netfow cut? = %d\n", (int)rv);
+
+    return rv;
+}
+
+/**
+ * Infixes which are only weakly guarded behave, in effect, like prefixes as
+ * they will often be live. We try to split such infixes further if they
+ * contain strong literals, so that we are at least running smaller weak
+ * infixes which can hopefully be accelerated/miracled.
+ *
+ * An infix counts as weak here if it follows a literal shorter than
+ * INFIX_STRONG_GUARD_LEN which is adjacent to the start vertex. Controlled by
+ * grey.violetAvoidWeakInfixes.
+ */
+static
+void improveWeakInfixes(RoseInGraph &vg, const CompileContext &cc) {
+    if (!cc.grey.violetAvoidWeakInfixes) {
+        return;
+    }
+    STAGE_DEBUG_PRINTF("IMPROVE WEAK INFIXES\n");
+
+    RoseInVertex start = getStart(vg);
+
+    set<NGHolder *> weak;
+    vector<NGHolder *> ordered_weak; /* weak, in first-seen order */
+
+    for (RoseInVertex guard : adjacent_vertices_range(start, vg)) {
+        /* outfixes shouldn't have made it this far */
+        assert(vg[guard].type == RIV_LITERAL);
+        if (vg[guard].s.length() >= INFIX_STRONG_GUARD_LEN) {
+            continue;
+        }
+
+        for (const RoseInEdge &e : out_edges_range(guard, vg)) {
+            const bool is_infix
+                = vg[target(e, vg)].type == RIV_LITERAL && vg[e].graph;
+            if (!is_infix) {
+                continue;
+            }
+
+            NGHolder *h = vg[e].graph.get();
+            DEBUG_PRINTF("'%s' guards %p\n", dumpString(vg[guard].s).c_str(),
+                         h);
+            if (weak.insert(h).second) {
+                ordered_weak.push_back(h);
+            }
+        }
+    }
+
+    /* pick up every edge carrying a weak graph, whatever guards it */
+    map<NGHolder *, vector<RoseInEdge> > weak_edges;
+    for (const RoseInEdge &ve : edges_range(vg)) {
+        NGHolder *h = vg[ve].graph.get();
+        if (contains(weak, h)) {
+            weak_edges[h].push_back(ve);
+        }
+    }
+
+    for (NGHolder *h : ordered_weak) {
+        improveInfix(*h, vg, weak_edges[h], cc);
+    }
+}
+
+/**
+ * \brief Add to \p vg the infix + literal + accept structure which stands in
+ * for one report behaviour of the suffix \p base_graph.
+ *
+ * An infix is made from a copy of \p base_graph in which only the vertices
+ * in split.vv lead to accept (with report 0). For each literal in split.lit
+ * which that infix can match, a literal vertex is added, connected on to a
+ * fresh accept (or acceptEod, if \p eod) vertex carrying \p reports and fed
+ * by the infix from the source of every edge in \p ee.
+ *
+ * The edges in \p ee themselves are not removed here.
+ */
+static
+void splitEdgesForSuffix(const NGHolder &base_graph, RoseInGraph &vg,
+                         const vector<RoseInEdge> &ee, const VertLitInfo &split,
+                         bool eod, const flat_set<ReportID> &reports) {
+    const vector<NFAVertex> &splitters = split.vv;
+    assert(!splitters.empty());
+
+    /* build the infix */
+    shared_ptr<NGHolder> lhs = make_shared<NGHolder>();
+    NGHolder &lhs_g = *lhs;
+    unordered_map<NFAVertex, NFAVertex> v_map;
+    cloneHolder(lhs_g, base_graph, &v_map);
+    lhs_g.kind = NFA_INFIX;
+    clear_in_edges(lhs_g.accept, lhs_g);
+    clear_in_edges(lhs_g.acceptEod, lhs_g);
+    add_edge(lhs_g.accept, lhs_g.acceptEod, lhs_g);
+    clearReports(lhs_g);
+    for (NFAVertex v : splitters) {
+        add_edge(v_map[v], lhs_g.accept, lhs_g);
+        lhs_g[v_map[v]].reports.insert(0);
+    }
+    pruneUseless(lhs_g);
+
+    /* create literal vertices and connect preds */
+    for (const auto &lit : split.lit) {
+        if (!can_match(lhs_g, lit, is_triggered(lhs_g))) {
+            continue;
+        }
+
+        DEBUG_PRINTF("best is '%s'\n", escapeString(lit).c_str());
+        RoseInVertex lit_v
+            = add_vertex(RoseInVertexProps::makeLiteral(lit), vg);
+
+        RoseInVertex acc_v;
+        if (!eod) {
+            DEBUG_PRINTF("doing non-eod\n");
+            acc_v = add_vertex(RoseInVertexProps::makeAccept(reports), vg);
+        } else {
+            DEBUG_PRINTF("doing eod\n");
+            acc_v = add_vertex(RoseInVertexProps::makeAcceptEod(reports), vg);
+        }
+        add_edge(lit_v, acc_v, RoseInEdgeProps(0U, 0U), vg);
+
+        for (const RoseInEdge &e : ee) {
+            RoseInVertex pred = source(e, vg);
+            assert(!edge(pred, lit_v, vg).second);
+            add_edge(pred, lit_v, RoseInEdgeProps(lhs, 0U), vg);
+        }
+    }
+}
+
+#define MIN_SUFFIX_LEN 6
+
+/**
+ * \brief Try to replace the suffix \p h on \p suffix_edges with infixes
+ * leading to trailing literals, one group per distinct (eod, report set).
+ *
+ * Nothing is changed unless every vertex leading to accept/acceptEod yields
+ * literals and every resulting literal set passes the quality check.
+ *
+ * \return true if the suffix was replaced.
+ */
+static
+bool replaceSuffixWithInfix(const NGHolder &h, RoseInGraph &vg,
+                            const vector<RoseInEdge> &suffix_edges,
+                            const CompileContext &cc) {
+    DEBUG_PRINTF("inspecting suffix : %p on %zu edges\n", &h,
+                 suffix_edges.size());
+    /*
+     * In general we would rather not have output exposed engines: once they
+     * have been triggered they must be run, whereas an infix only has to run
+     * if the successor literal is seen. Matches from output exposed engines
+     * also have to go through a priority queue to be interleaved with matches
+     * from other sources.
+     *
+     * Note:
+     * - if the LHS is extremely unlikely we may be better off leaving a
+     *   suffix unguarded.
+     *
+     * - limited width suffixes may be less bad as they won't be continuously
+     *   active; we may want (a) stronger controls on whether we pick a
+     *   trailing literal in these cases and/or (b) to look for literals near
+     *   accept as well as right on accept.
+     *
+     * TODO: improve heuristics, splitting logic.
+     */
+
+    /* we may do multiple splits corresponding to different report behaviour */
+    set<NFAVertex> seen;
+    map<pair<bool, flat_set<ReportID> >, VertLitInfo> by_reports; /* eod, rep */
+
+    /* vertices reporting at accept */
+    for (NFAVertex v : inv_adjacent_vertices_range(h.accept, h)) {
+        set<ue2_literal> lits = getLiteralSet(h, v, false);
+        if (lits.empty()) {
+            DEBUG_PRINTF("candidate is too shitty\n");
+            return false;
+        }
+
+        VertLitInfo &group = by_reports[make_pair(false, h[v].reports)];
+        insert(&group.lit, lits);
+        group.vv.push_back(v);
+        seen.insert(v);
+    }
+
+    /* vertices reporting only at acceptEod; accept itself is skipped */
+    seen.insert(h.accept);
+    for (NFAVertex v : inv_adjacent_vertices_range(h.acceptEod, h)) {
+        if (contains(seen, v)) {
+            continue;
+        }
+
+        set<ue2_literal> lits = getLiteralSet(h, v, false);
+        if (lits.empty()) {
+            DEBUG_PRINTF("candidate is too shitty\n");
+            return false;
+        }
+
+        VertLitInfo &group = by_reports[make_pair(true, h[v].reports)];
+        insert(&group.lit, lits);
+        group.vv.push_back(v);
+    }
+
+    assert(!by_reports.empty());
+
+    /* TODO: how strong a min len do we want here ? */
+    u32 min_lit_len = cc.grey.minRoseLiteralLength;
+    ENSURE_AT_LEAST(&min_lit_len, MIN_SUFFIX_LEN);
+
+    /* every group has to be good enough or we do nothing at all */
+    for (auto &group : by_reports | map_values) {
+        u64a score = sanitizeAndCompressAndScore(group.lit);
+
+        if (group.lit.empty()) {
+            return false;
+        }
+        if (!validateRoseLiteralSetQuality(group.lit, score, false,
+                                           min_lit_len, false)) {
+            return false;
+        }
+    }
+
+    for (const auto &info : by_reports) {
+        DEBUG_PRINTF("splitting on simple literals\n");
+        const bool eod = info.first.first;
+        const flat_set<ReportID> &reports = info.first.second;
+        splitEdgesForSuffix(h, vg, suffix_edges, info.second, eod, reports);
+    }
+
+    for (const RoseInEdge &e : suffix_edges) {
+        remove_edge(e, vg);
+    }
+    return true;
+}
+
+/**
+ * \brief Try to turn each suffix of \p vg (a graph on an in edge of the
+ * primary accept) into an infix guarded by trailing literals.
+ *
+ * Controlled by grey.violetAvoidSuffixes. Suffixes are handled in the order
+ * in which they are first seen.
+ */
+static
+void avoidSuffixes(RoseInGraph &vg, const CompileContext &cc) {
+    if (!cc.grey.violetAvoidSuffixes) {
+        return;
+    }
+
+    STAGE_DEBUG_PRINTF("AVOID SUFFIXES\n");
+
+    RoseInVertex accept = getPrimaryAccept(vg);
+    vector<const NGHolder *> ordered_suffixes;
+    map<const NGHolder *, vector<RoseInEdge> > suffixes;
+
+    /* find suffixes */
+    for (const RoseInEdge &e : in_edges_range(accept, vg)) {
+        /* outfixes shouldn't have made it this far */
+        assert(vg[source(e, vg)].type == RIV_LITERAL);
+        /* non suffix paths should be wired to other accepts */
+        assert(vg[e].graph);
+
+        const NGHolder *suffix = vg[e].graph.get();
+        if (!contains(suffixes, suffix)) {
+            ordered_suffixes.push_back(suffix);
+        }
+        suffixes[suffix].push_back(e);
+    }
+
+    /* look at suffixes and try to split */
+    for (const NGHolder *suffix : ordered_suffixes) {
+        replaceSuffixWithInfix(*suffix, vg, suffixes[suffix], cc);
+    }
+}
+
+/**
+ * \brief Look for a literal at the very front of \p h which is reachable
+ * both directly from start and via a looping dot vertex.
+ *
+ * On success \p out holds the literal and the vertex at which it ends.
+ *
+ * \return false if the graph does not have this shape, or if the literal
+ *         runs all the way to a match vertex in anything but a suffix.
+ */
+static
+bool leadingDotStartLiteral(const NGHolder &h, VertLitInfo *out) {
+    if (out_degree(h.start, h) != 3) {
+        return false;
+    }
+
+    /* single character, or the two cases of a single letter */
+    const auto is_literal_char = [&h](NFAVertex x) {
+        const CharReach &cr = h[x].char_reach;
+        return cr.count() == 1 || cr.isCaselessChar();
+    };
+
+    NFAVertex lit_v = NGHolder::null_vertex();
+    NFAVertex dot_v = NGHolder::null_vertex();
+
+    for (NFAVertex a : adjacent_vertices_range(h.start, h)) {
+        if (a == h.startDs) {
+            continue;
+        }
+        if (!h[a].char_reach.all()) {
+            lit_v = a;
+            continue;
+        }
+
+        /* the dot may only loop on itself and lead to one other vertex */
+        dot_v = a;
+        if (out_degree(dot_v, h) != 2 || !edge(dot_v, dot_v, h).second) {
+            return false;
+        }
+    }
+
+    if (!lit_v || !dot_v || !edge(dot_v, lit_v, h).second) {
+        return false;
+    }
+
+    if (!is_literal_char(lit_v)) {
+        return false;
+    }
+
+    ue2_literal lit;
+    lit.push_back(h[lit_v].char_reach.find_first(),
+                  h[lit_v].char_reach.isCaselessChar());
+
+    /* extend the literal along the chain of single-successor vertices */
+    while (out_degree(lit_v, h) == 1) {
+        NFAVertex next = *adjacent_vertices(lit_v, h).first;
+        if (!is_literal_char(next)) {
+            break;
+        }
+
+        lit_v = next;
+
+        lit.push_back(h[lit_v].char_reach.find_first(),
+                      h[lit_v].char_reach.isCaselessChar());
+    }
+
+    if (is_match_vertex(lit_v, h) && h.kind != NFA_SUFFIX) {
+        /* we have rediscovered the post-infix literal */
+        return false;
+    }
+
+    if (bad_mixed_sensitivity(lit)) {
+        make_nocase(&lit);
+    }
+
+    DEBUG_PRINTF("%u found %s\n", h[lit_v].index, dumpString(lit).c_str());
+    out->vv = {lit_v};
+    out->lit = {lit};
+    return true;
+}
+
+/**
+ * \brief Split the edges \p ee carrying \p h on a leading literal (see
+ * leadingDotStartLiteral()), provided its shortest literal is at least
+ * grey.violetDoubleCutLiteralLen long.
+ *
+ * \return true if the split was performed.
+ */
+static
+bool lookForDoubleCut(const NGHolder &h, const vector<RoseInEdge> &ee,
+                      RoseInGraph &vg, const Grey &grey) {
+    VertLitInfo info;
+    if (!leadingDotStartLiteral(h, &info)) {
+        return false;
+    }
+    if (min_len(info.lit) < grey.violetDoubleCutLiteralLen) {
+        return false;
+    }
+
+    DEBUG_PRINTF("performing split\n");
+    return splitRoseEdge(h, vg, ee, {info});
+}
+
+/**
+ * \brief Apply the per-graph lookForDoubleCut() to every graph which follows
+ * a literal vertex in \p vg, in first-seen order.
+ *
+ * Controlled by grey.violetDoubleCut.
+ */
+static
+void lookForDoubleCut(RoseInGraph &vg, const CompileContext &cc) {
+    if (!cc.grey.violetDoubleCut) {
+        return;
+    }
+
+    vector<const NGHolder *> ordered_graphs;
+    map<const NGHolder *, vector<RoseInEdge> > right_edges;
+    for (const RoseInEdge &ve : edges_range(vg)) {
+        if (!vg[ve].graph || vg[source(ve, vg)].type != RIV_LITERAL) {
+            continue;
+        }
+
+        const NGHolder *rhs = vg[ve].graph.get();
+        if (!contains(right_edges, rhs)) {
+            ordered_graphs.push_back(rhs);
+        }
+        right_edges[rhs].push_back(ve);
+    }
+
+    for (const NGHolder *rhs : ordered_graphs) {
+        lookForDoubleCut(*rhs, right_edges[rhs], vg, cc.grey);
+    }
+}
+
+/**
+ * \brief Walk backwards from \p v along single-predecessor vertices,
+ * collecting the literal they spell.
+ *
+ * \return the earliest vertex reached together with the literal, whose
+ *         characters are stored in walk order (i.e. last character first).
+ *         If \p v itself is not a literal character, the literal is empty.
+ */
+static
+pair<NFAVertex, ue2_literal> findLiteralBefore(const NGHolder &h, NFAVertex v) {
+    /* single character, or the two cases of a single letter */
+    const auto is_literal_char = [&h](NFAVertex x) {
+        const CharReach &cr = h[x].char_reach;
+        return cr.count() == 1 || cr.isCaselessChar();
+    };
+
+    ue2_literal lit;
+    if (!is_literal_char(v)) {
+        return {v, std::move(lit) };
+    }
+    lit.push_back(h[v].char_reach.find_first(),
+                  h[v].char_reach.isCaselessChar());
+
+    while (in_degree(v, h) == 1) {
+        NFAVertex pred = *inv_adjacent_vertices(v, h).first;
+        if (!is_literal_char(pred)) {
+            break;
+        }
+
+        lit.push_back(h[pred].char_reach.find_first(),
+                      h[pred].char_reach.isCaselessChar());
+        v = pred;
+    }
+
+    return {v, std::move(lit) };
+}
+
+/**
+ * \brief Check that the predecessors of \p v are exactly one looping dot
+ * vertex and one other vertex.
+ *
+ * \param[out] u the predecessor without full reach.
+ * \param[out] ds the dot predecessor.
+ * \return true if both were found and there were no other predecessors. The
+ *         outputs should only be relied upon if true is returned.
+ */
+static
+bool lookForDotStarPred(NFAVertex v, const NGHolder &h,
+                        NFAVertex *u, NFAVertex *ds) {
+    *u = NGHolder::null_vertex();
+    *ds = NGHolder::null_vertex();
+
+    for (NFAVertex pred : inv_adjacent_vertices_range(v, h)) {
+        if (!h[pred].char_reach.all()) {
+            /* at most one non-dot predecessor */
+            if (*u) {
+                return false;
+            }
+            *u = pred;
+            continue;
+        }
+
+        /* a dot: it must loop and there can only be one */
+        if (!edge(pred, pred, h).second) {
+            return false;
+        }
+        if (*ds) {
+            return false;
+        }
+        *ds = pred;
+    }
+
+    return *u && *ds;
+}
+
+/**
+ * \brief Look for the literal immediately before the final literal of \p h,
+ * where both are preceded by a looping dot vertex.
+ *
+ * On success \p out holds the literal and the vertex at which it starts.
+ */
+static
+bool trailingDotStarLiteral(const NGHolder &h, VertLitInfo *out) {
+    /* Note: there is no delay yet - so the final literal is the already
+     * discovered successor literal - we are in fact interested in the literal
+     * before it. */
+
+    if (in_degree(h.accept, h) != 1) {
+        return false;
+    }
+
+    if (in_degree(h.acceptEod, h) != 1) {
+        assert(0);
+        return false;
+    }
+
+    /* step back over the final literal */
+    const NFAVertex last = *inv_adjacent_vertices(h.accept, h).first;
+    NFAVertex v = findLiteralBefore(h, last).first;
+
+    NFAVertex u;
+    NFAVertex ds;
+
+    if (!lookForDotStarPred(v, h, &u, &ds)) {
+        return false;
+    }
+
+    /* now collect the literal in front of it */
+    v = u;
+    auto before = findLiteralBefore(h, v);
+
+    if (!lookForDotStarPred(v, h, &u, &ds)) {
+        return false;
+    }
+
+    /* the literal was gathered back to front */
+    ue2_literal lit = reverse_literal(before.second);
+    DEBUG_PRINTF("%u found %s\n", h[v].index, dumpString(lit).c_str());
+
+    if (bad_mixed_sensitivity(lit)) {
+        make_nocase(&lit);
+    }
+
+    out->vv = {v};
+    out->lit = {lit};
+    return true;
+}
+
+/* Splits the edges ee (which all carry graph h) at the literal reported by
+ * trailingDotStarLiteral(), provided one is found and its minimum length is
+ * at least grey.violetDoubleCutLiteralLen. Returns the result of
+ * splitRoseEdge(), or false if no split was attempted. */
+static
+bool lookForTrailingLiteralDotStar(const NGHolder &h,
+                                   const vector<RoseInEdge> &ee,
+                                   RoseInGraph &vg, const Grey &grey) {
+    VertLitInfo info;
+    if (!trailingDotStarLiteral(h, &info)) {
+        return false;
+    }
+
+    if (min_len(info.lit) < grey.violetDoubleCutLiteralLen) {
+        /* literal is too short to be worth cutting at */
+        return false;
+    }
+
+    DEBUG_PRINTF("performing split\n");
+    return splitRoseEdge(h, vg, ee, info);
+}
+
+/* In streaming mode, active engines have to be caught up at stream boundaries
+ * and have to be stored in stream state, so we prefer to decompose patterns
+ * into literals with no state between them if possible.
+ *
+ * Each pass regroups the graph-carrying edges that leave literal vertices and
+ * tries a double cut on every graph, falling back to the trailing literal
+ * search for graphs that are not suffixes. Passes are repeated until one of
+ * them makes no change. Nothing is done unless grey.violetLiteralChains is
+ * set. */
+static
+void decomposeLiteralChains(RoseInGraph &vg, const CompileContext &cc) {
+    if (!cc.grey.violetLiteralChains) {
+        return;
+    }
+
+    for (bool changed = true; changed;) {
+        changed = false;
+
+        /* The grouping is rebuilt on every pass. Graphs are visited in
+         * first-seen order rather than in the pointer order of the map. */
+        vector<const NGHolder *> graph_order;
+        map<const NGHolder *, vector<RoseInEdge> > edges_by_graph;
+        for (const RoseInEdge &e : edges_range(vg)) {
+            if (!vg[e].graph || vg[source(e, vg)].type != RIV_LITERAL) {
+                continue;
+            }
+
+            const NGHolder *g = vg[e].graph.get();
+            vector<RoseInEdge> &bucket = edges_by_graph[g];
+            if (bucket.empty()) {
+                graph_order.push_back(g);
+            }
+            bucket.push_back(e);
+        }
+
+        for (const NGHolder *g : graph_order) {
+            const vector<RoseInEdge> &ee = edges_by_graph[g];
+
+            /* The trailing literal search is only attempted when the double
+             * cut made no change and the graph is not a suffix; g->kind is
+             * not read once the double cut has reported a split. */
+            const bool did_split = lookForDoubleCut(*g, ee, vg, cc.grey)
+                || (g->kind != NFA_SUFFIX
+                    && lookForTrailingLiteralDotStar(*g, ee, vg, cc.grey));
+            changed |= did_split;
+        }
+    }
+}
+
+/* Asks findBestCleanSplit() for a split of graph h and, if one is returned,
+ * applies it to the edges ee via splitRoseEdge(). Returns the result of
+ * splitRoseEdge(), or false when no split was found. */
+static
+bool lookForCleanSplit(const NGHolder &h, const vector<RoseInEdge> &ee,
+                       RoseInGraph &vg, const CompileContext &cc) {
+    const unique_ptr<VertLitInfo> split = findBestCleanSplit(h, cc);
+    if (!split) {
+        return false;
+    }
+
+    return splitRoseEdge(h, vg, ee, *split);
+}
+
+/* Number of generations of vertices, counted outwards from the start vertex,
+ * that lookForCleanEarlySplits() examines. */
+#define MAX_DESIRED_CLEAN_SPLIT_DEPTH 4
+
+/* Walks the rose graph outwards from its start vertex, one generation of
+ * successors at a time, for MAX_DESIRED_CLEAN_SPLIT_DEPTH generations. In
+ * each generation every graph found on an out-edge of a vertex in that
+ * generation is offered to lookForCleanSplit(). */
+static
+void lookForCleanEarlySplits(RoseInGraph &vg, const CompileContext &cc) {
+    set<RoseInVertex> prev = {getStart(vg)};
+
+    for (u32 gen = 0; gen < MAX_DESIRED_CLEAN_SPLIT_DEPTH; gen++) {
+        /* the current generation: all successors of the previous one */
+        set<RoseInVertex> curr;
+        for (RoseInVertex u : prev) {
+            insert(&curr, adjacent_vertices(u, vg));
+        }
+
+        /* Group the graph-carrying out-edges of this generation by graph,
+         * keeping the order in which the graphs were first seen. */
+        vector<const NGHolder *> graph_order;
+        map<const NGHolder *, vector<RoseInEdge> > rightfixes;
+        for (RoseInVertex v : curr) {
+            for (const RoseInEdge &e : out_edges_range(v, vg)) {
+                if (!vg[e].graph) {
+                    continue;
+                }
+
+                const NGHolder *g = vg[e].graph.get();
+                vector<RoseInEdge> &bucket = rightfixes[g];
+                if (bucket.empty()) {
+                    graph_order.push_back(g);
+                }
+                bucket.push_back(e);
+            }
+        }
+
+        for (const NGHolder *g : graph_order) {
+            lookForCleanSplit(*g, rightfixes[g], vg, cc);
+        }
+
+        /* curr is not used again in this iteration */
+        prev = std::move(curr);
+    }
+}
+
+/* Moves edges that lead to an accept vertex over to a fresh acceptEod vertex
+ * when the graph on the edge has no in-edges on its own accept vertex. Edges
+ * that carry a haig or no graph at all are left alone. */
+static
+void rehomeEodSuffixes(RoseInGraph &vg) {
+    /* Gather the candidates first; the graph is only modified once the edge
+     * walk is finished. */
+    vector<RoseInEdge> to_move;
+    for (const auto &e : edges_range(vg)) {
+        const bool candidate = vg[target(e, vg)].type == RIV_ACCEPT
+                               && !vg[e].haig && vg[e].graph;
+        if (!candidate) {
+            continue;
+        }
+
+        const NGHolder &h = *vg[e].graph;
+        if (in_degree(h.accept, h)) {
+            DEBUG_PRINTF("graph isn't eod anchored\n");
+            continue;
+        }
+
+        to_move.push_back(e);
+    }
+
+    /* Re-create each collected edge, with its properties, from the same
+     * source to a new acceptEod vertex of its own, then drop the old edge. */
+    for (const RoseInEdge &e : to_move) {
+        RoseInVertex eod = add_vertex(RoseInVertexProps::makeAcceptEod(), vg);
+        add_edge(source(e, vg), eod, vg[e], vg);
+        remove_edge(e, vg);
+    }
+
+    /* old accept vertices will be tidied up by final pruneUseless() call */
+}
+
+/* Entry point of the violet pass: builds a rose input graph from h, runs the
+ * decomposition steps below over it and hands the result to rose.addRose().
+ *
+ * Returns false without touching rose if grey.allowViolet is off or if the
+ * graph still has at most two vertices after avoidOutfixes(); otherwise
+ * returns whatever rose.addRose() reports. h must be able to match. */
+bool doViolet(RoseBuild &rose, const NGHolder &h, bool prefilter,
+              const CompileContext &cc) {
+    assert(!can_never_match(h));
+
+    if (!cc.grey.allowViolet) {
+        return false;
+    }
+
+    DEBUG_PRINTF("hello world\n");
+
+    RoseInGraph vg = populateTrivialGraph(h);
+
+    /* Step 1: avoid outfixes as we always have to run them. */
+    avoidOutfixes(vg, cc);
+
+    if (num_vertices(vg) <= 2) {
+        /* only have an outfix; leave for ng_rose for now */
+        return false;
+    }
+
+    removeRedundantPrefixes(vg);
+    dumpPreRoseGraph(vg, cc.grey, "pre_prefix_rose.dot");
+
+    /* Step 2: avoid non-transient prefixes (especially in streaming mode). */
+    findBetterPrefixes(vg, cc);
+    dumpPreRoseGraph(vg, cc.grey, "post_prefix_rose.dot");
+
+    extractStrongLiterals(vg, cc);
+    dumpPreRoseGraph(vg, cc.grey, "post_extract_rose.dot");
+
+    improveWeakInfixes(vg, cc);
+    dumpPreRoseGraph(vg, cc.grey, "post_infix_rose.dot");
+
+    /* Step 3: avoid output exposed engines if there is a strong trailing
+     * literal. */
+    avoidSuffixes(vg, cc);
+
+    /* Step 4: look for infixes/suffixes with leading .*literals.
+     * This can reduce the amount of work a heavily picked literal has to do
+     * and reduce the amount of state used as .* is handled internally to
+     * rose. */
+    lookForDoubleCut(vg, cc);
+
+    /* these two decompositions are only run for streaming mode */
+    if (cc.streaming) {
+        lookForCleanEarlySplits(vg, cc);
+        decomposeLiteralChains(vg, cc);
+    }
+
+    /* Step 5: avoid unimplementable, or overly large engines if possible. */
+    /* TODO: later - ng_rose is currently acting as a backstop */
+
+    /* Step 6: send to rose. */
+    rehomeEodSuffixes(vg);
+    removeRedundantLiterals(vg, cc);
+
+    pruneUseless(vg);
+    dumpPreRoseGraph(vg, cc.grey);
+    calcVertexOffsets(vg);
+
+    const bool ok = rose.addRose(vg, prefilter);
+    DEBUG_PRINTF("violet: %s\n", ok ? "success" : "fail");
+    return ok;
+}
+
+}