git.ipfire.org Git - thirdparty/vectorscan.git/commitdiff
Rose build: move HWLM build code to own file
author: Justin Viiret <justin.viiret@intel.com>
Wed, 2 Mar 2016 02:05:33 +0000 (13:05 +1100)
committer: Matthew Barr <matthew.barr@intel.com>
Wed, 20 Apr 2016 03:34:54 +0000 (13:34 +1000)
To reduce the size of rose_build_bytecode.cpp a little, move the code
that deals with HWLM literal tables into its own new file.

CMakeLists.txt
src/rose/rose_build_bytecode.cpp
src/rose/rose_build_dump.cpp
src/rose/rose_build_impl.h
src/rose/rose_build_matchers.cpp [new file with mode: 0644]
src/rose/rose_build_matchers.h [new file with mode: 0644]

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 49c9a4add880d289d1553192ca549ef7f482d4d2..36267fc618c2469a06524b8aefa524e9df118aac 100644 (file)
@@ -810,6 +810,8 @@ SET (hs_SRCS
     src/rose/rose_build_infix.h
     src/rose/rose_build_lookaround.cpp
     src/rose/rose_build_lookaround.h
+    src/rose/rose_build_matchers.cpp
+    src/rose/rose_build_matchers.h
     src/rose/rose_build_merge.cpp
     src/rose/rose_build_merge.h
     src/rose/rose_build_misc.cpp
diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp
index 401c15c1595a737f7a66f90c67abfadf454682e5..c2d9bccedb7fa023e2e90fcd9b78f1e35759423b 100644 (file)
 #include "rose_build_anchored.h"
 #include "rose_build_infix.h"
 #include "rose_build_lookaround.h"
+#include "rose_build_matchers.h"
 #include "rose_build_scatter.h"
 #include "rose_build_util.h"
 #include "rose_build_width.h"
 #include "rose_program.h"
 #include "hwlm/hwlm.h" /* engine types */
-#include "hwlm/hwlm_build.h"
 #include "nfa/castlecompile.h"
 #include "nfa/goughcompile.h"
 #include "nfa/mcclellancompile.h"
@@ -68,7 +68,6 @@
 #include "util/compile_error.h"
 #include "util/container.h"
 #include "util/graph_range.h"
-#include "util/dump_charclass.h"
 #include "util/internal_report.h"
 #include "util/multibit_build.h"
 #include "util/order_check.h"
 #include "util/verify_types.h"
 
 #include <algorithm>
-#include <iomanip>
 #include <map>
 #include <queue>
 #include <set>
-#include <sstream>
 #include <string>
 #include <vector>
 #include <utility>
@@ -1674,619 +1671,6 @@ u32 RoseBuildImpl::calcHistoryRequired() const {
     return m ? m - 1 : 0;
 }
 
-#ifdef DEBUG
-static UNUSED
-string dumpMask(const vector<u8> &v) {
-    ostringstream oss;
-    for (u8 e : v) {
-        oss << setfill('0') << setw(2) << hex << (unsigned int)e;
-    }
-    return oss.str();
-}
-#endif
-
-static
-bool maskFromLeftGraph(const LeftEngInfo &left, vector<u8> &msk,
-                       vector<u8> &cmp) {
-    const u32 lag = left.lag;
-    const ReportID report = left.leftfix_report;
-
-    DEBUG_PRINTF("leftfix with lag %u, report %u\n", lag, report);
-
-    assert(left.graph);
-    const NGHolder &h = *left.graph;
-    assert(in_degree(h.acceptEod, h) == 1); // no eod reports
-
-    // Start with the set of reporter vertices for this leftfix.
-    set<NFAVertex> curr;
-    for (auto u : inv_adjacent_vertices_range(h.accept, h)) {
-        if (contains(h[u].reports, report)) {
-            curr.insert(u);
-        }
-    }
-    assert(!curr.empty());
-
-    size_t i = HWLM_MASKLEN - lag - 1;
-    do {
-        if (curr.empty() || contains(curr, h.start)
-            || contains(curr, h.startDs)) {
-            DEBUG_PRINTF("end of the road\n");
-            break;
-        }
-
-        set<NFAVertex> next;
-        CharReach cr;
-        for (NFAVertex v : curr) {
-            const auto &v_cr = h[v].char_reach;
-            DEBUG_PRINTF("vertex %u, reach %s\n", h[v].index,
-                         describeClass(v_cr).c_str());
-            cr |= v_cr;
-            insert(&next, inv_adjacent_vertices(v, h));
-        }
-        make_and_cmp_mask(cr, &msk.at(i), &cmp.at(i));
-        DEBUG_PRINTF("%zu: reach=%s, msk=%u, cmp=%u\n", i,
-                     describeClass(cr).c_str(), msk[i], cmp[i]);
-        curr.swap(next);
-    } while (i-- > 0);
-
-    return true;
-}
-
-static
-bool maskFromLeftCastle(const LeftEngInfo &left, vector<u8> &msk,
-                        vector<u8> &cmp) {
-    const u32 lag = left.lag;
-    const ReportID report = left.leftfix_report;
-
-    DEBUG_PRINTF("leftfix with lag %u, report %u\n", lag, report);
-
-    assert(left.castle);
-    const CastleProto &c = *left.castle;
-
-    depth min_width(depth::infinity());
-    for (const PureRepeat &repeat : c.repeats | map_values) {
-        if (contains(repeat.reports, report)) {
-            min_width = min(min_width, repeat.bounds.min);
-        }
-    }
-
-    DEBUG_PRINTF("castle min width for this report is %s\n",
-                 min_width.str().c_str());
-
-    if (!min_width.is_finite() || min_width == depth(0)) {
-        DEBUG_PRINTF("bad min width\n");
-        return false;
-    }
-
-    u32 len = min_width;
-    u32 end = HWLM_MASKLEN - lag;
-    for (u32 i = end; i > end - min(end, len); i--) {
-        make_and_cmp_mask(c.reach(), &msk.at(i - 1), &cmp.at(i - 1));
-    }
-
-    return true;
-}
-
-static
-bool maskFromLeft(const LeftEngInfo &left, vector<u8> &msk, vector<u8> &cmp) {
-    if (left.lag >= HWLM_MASKLEN) {
-        DEBUG_PRINTF("too much lag\n");
-        return false;
-    }
-
-    if (left.graph) {
-        return maskFromLeftGraph(left, msk, cmp);
-    } else if (left.castle) {
-        return maskFromLeftCastle(left, msk, cmp);
-    }
-
-    return false;
-}
-
-static
-bool maskFromPreds(const RoseBuildImpl &tbi, const rose_literal_id &id,
-                   const RoseVertex v, vector<u8> &msk, vector<u8> &cmp) {
-    const RoseGraph &g = tbi.g;
-
-    // For right now, wuss out and only handle cases with one pred.
-    if (in_degree(v, g) != 1) {
-        return false;
-    }
-
-    // Root successors have no literal before them.
-    if (tbi.isRootSuccessor(v)) {
-        return false;
-    }
-
-    // If we have a single predecessor with a short bound, we may be able to
-    // fill out a mask with the trailing bytes of the previous literal. This
-    // allows us to improve literals like the 'bar' in 'fo.bar'.
-
-    RoseEdge e = *(in_edges(v, g).first);
-    u32 bound = g[e].maxBound;
-    if (bound != g[e].minBound || bound >= HWLM_MASKLEN) {
-        return false;
-    }
-
-    bound += id.s.length();
-    if (bound >= HWLM_MASKLEN) {
-        return false;
-    }
-
-    DEBUG_PRINTF("bound %u\n", bound);
-
-    RoseVertex u = source(e, g);
-    if (g[u].literals.size() != 1) {
-        DEBUG_PRINTF("u has %zu literals\n", g[u].literals.size());
-        return false;
-    }
-
-    u32 u_lit_id = *(g[u].literals.begin());
-    const rose_literal_id &u_id = tbi.literals.right.at(u_lit_id);
-    DEBUG_PRINTF("u has lit: %s\n", escapeString(u_id.s).c_str());
-
-    // Number of characters to take from the back of u's literal.
-    size_t u_len = u_id.s.length();
-    size_t u_sublen = min(u_len, (size_t)HWLM_MASKLEN - bound);
-
-    size_t i = HWLM_MASKLEN - (bound + u_sublen);
-
-    ue2_literal::const_iterator it, ite;
-    for (it = u_id.s.begin() + (u_len - u_sublen), ite = u_id.s.end();
-            it != ite; ++it) {
-        make_and_cmp_mask(*it, &msk.at(i), &cmp.at(i));
-        ++i;
-    }
-
-    return true;
-}
-
-static
-bool findHamsterMask(const RoseBuildImpl &tbi, const rose_literal_id &id,
-                     const rose_literal_info &info, const RoseVertex v,
-                     vector<u8> &msk, vector<u8> &cmp) {
-    // Start with zero masks.
-    msk.assign(HWLM_MASKLEN, 0);
-    cmp.assign(HWLM_MASKLEN, 0);
-
-    // Masks can come from literal benefits (for mixed-case literals).
-    if (info.requires_benefits) {
-        assert(mixed_sensitivity(id.s));
-
-        size_t j = 0;
-        for (ue2_literal::const_reverse_iterator it = id.s.rbegin(),
-                                                 ite = id.s.rend();
-             it != ite && j < HWLM_MASKLEN; ++it, ++j) {
-            size_t offset = HWLM_MASKLEN - j - 1;
-            const CharReach &cr = *it;
-            make_and_cmp_mask(cr, &msk[offset], &cmp[offset]);
-        }
-        return true;
-    }
-
-    const LeftEngInfo &left = tbi.g[v].left;
-    if (left && left.lag < HWLM_MASKLEN) {
-        if (maskFromLeft(left, msk, cmp)) {
-            DEBUG_PRINTF("mask from a leftfix!\n");
-            return true;
-        }
-    }
-
-    if (id.s.length() < HWLM_MASKLEN) {
-        if (maskFromPreds(tbi, id, v, msk, cmp)) {
-            DEBUG_PRINTF("mask from preds!\n");
-            return true;
-        }
-    }
-
-    return false;
-}
-
-static
-bool hamsterMaskCombine(vector<u8> &msk, vector<u8> &cmp,
-                        const vector<u8> &v_msk, const vector<u8> &v_cmp) {
-    assert(msk.size() == HWLM_MASKLEN && cmp.size() == HWLM_MASKLEN);
-    assert(v_msk.size() == HWLM_MASKLEN && v_cmp.size() == HWLM_MASKLEN);
-
-    u8 all_masks = 0;
-
-    for (size_t i = 0; i < HWLM_MASKLEN; i++) {
-        u8 filter = ~(cmp[i] ^ v_cmp[i]);
-        msk[i] &= v_msk[i];
-        msk[i] &= filter;
-        cmp[i] &= filter;
-
-        all_masks |= msk[i];
-    }
-
-    // Return false if we have no bits on in any mask elements.
-    return all_masks != 0;
-}
-
-static
-bool findHamsterMask(const RoseBuildImpl &tbi, const rose_literal_id &id,
-                     const rose_literal_info &info,
-                     vector<u8> &msk, vector<u8> &cmp) {
-    if (!tbi.cc.grey.roseHamsterMasks) {
-        return false;
-    }
-
-    if (!info.delayed_ids.empty()) {
-        // Not safe to add masks to delayed literals at this late stage.
-        return false;
-    }
-
-    size_t num = 0;
-    vector<u8> v_msk, v_cmp;
-
-    for (RoseVertex v : info.vertices) {
-        if (!findHamsterMask(tbi, id, info, v, v_msk, v_cmp)) {
-            DEBUG_PRINTF("no mask\n");
-            return false;
-        }
-
-        if (!num++) {
-            // First (or only) vertex, this becomes the mask/cmp pair.
-            msk = v_msk;
-            cmp = v_cmp;
-        } else {
-            // Multiple vertices with potentially different masks. We combine
-            // them into an 'advisory' mask.
-            if (!hamsterMaskCombine(msk, cmp, v_msk, v_cmp)) {
-                DEBUG_PRINTF("mask went to zero\n");
-                return false;
-            }
-        }
-    }
-
-    normaliseLiteralMask(id.s, msk, cmp);
-
-    if (msk.empty()) {
-        DEBUG_PRINTF("no mask\n");
-        return false;
-    }
-
-    DEBUG_PRINTF("msk=%s, cmp=%s\n", dumpMask(msk).c_str(),
-                 dumpMask(cmp).c_str());
-    return true;
-}
-
-static
-bool isDirectHighlander(const RoseBuildImpl &build, const u32 id,
-                        const rose_literal_info &info) {
-    if (!build.isDirectReport(id)) {
-        return false;
-    }
-
-    auto is_simple_exhaustible = [&build](ReportID id) {
-        const Report &report = build.rm.getReport(id);
-        return isSimpleExhaustible(report);
-    };
-
-    assert(!info.vertices.empty());
-    for (const auto &v : info.vertices) {
-        const auto &reports = build.g[v].reports;
-        assert(!reports.empty());
-        if (!all_of(begin(reports), end(reports),
-                    is_simple_exhaustible)) {
-            return false;
-        }
-    }
-    return true;
-}
-
-// Called by isNoRunsLiteral below.
-static
-bool isNoRunsVertex(const RoseBuildImpl &tbi, NFAVertex u) {
-    const RoseGraph &g = tbi.g;
-    if (!g[u].isBoring()) {
-        DEBUG_PRINTF("u=%zu is not boring\n", g[u].idx);
-        return false;
-    }
-
-    if (!g[u].reports.empty()) {
-        DEBUG_PRINTF("u=%zu has accept\n", g[u].idx);
-        return false;
-    }
-
-    /* TODO: handle non-root roles as well. It can't be that difficult... */
-
-    if (!in_degree_equal_to(u, g, 1)) {
-        DEBUG_PRINTF("u=%zu is not a root role\n", g[u].idx);
-        return false;
-    }
-
-    RoseEdge e;
-    bool exists;
-    tie(e, exists) = edge_by_target(tbi.root, u, g);
-
-    if (!exists) {
-        DEBUG_PRINTF("u=%zu is not a root role\n", g[u].idx);
-        return false;
-    }
-
-    if (g[e].minBound != 0 || g[e].maxBound != ROSE_BOUND_INF) {
-        DEBUG_PRINTF("u=%zu has bounds from root\n", g[u].idx);
-        return false;
-    }
-
-    for (const auto &oe : out_edges_range(u, g)) {
-        RoseVertex v = target(oe, g);
-        if (g[oe].maxBound != ROSE_BOUND_INF) {
-            DEBUG_PRINTF("edge (%zu,%zu) has max bound\n", g[u].idx,
-                    g[target(oe, g)].idx);
-            return false;
-        }
-        if (g[v].left) {
-            DEBUG_PRINTF("v=%zu has rose prefix\n", g[v].idx);
-            return false;
-        }
-    }
-    return true;
-}
-
-static
-bool isNoRunsLiteral(const RoseBuildImpl &tbi, const u32 id,
-                     const rose_literal_info &info) {
-    DEBUG_PRINTF("lit id %u\n", id);
-
-    if (info.requires_benefits) {
-        DEBUG_PRINTF("requires benefits\n"); // which would need confirm
-        return false;
-    }
-
-    if (isDirectHighlander(tbi, id, info)) {
-        DEBUG_PRINTF("highlander direct report\n");
-        return true;
-    }
-
-    // Undelayed vertices.
-    for (RoseVertex v : info.vertices) {
-        if (!isNoRunsVertex(tbi, v)) {
-            return false;
-        }
-    }
-
-    // Delayed vertices.
-    for (u32 d : info.delayed_ids) {
-        assert(d < tbi.literal_info.size());
-        const rose_literal_info &delayed_info = tbi.literal_info.at(d);
-        assert(delayed_info.undelayed_id == id);
-        for (RoseVertex v : delayed_info.vertices) {
-            if (!isNoRunsVertex(tbi, v)) {
-                return false;
-            }
-        }
-    }
-
-    DEBUG_PRINTF("is no-runs literal\n");
-    return true;
-}
-
-void fillHamsterLiteralList(const RoseBuildImpl &tbi, rose_literal_table table,
-                            vector<hwlmLiteral> *hl) {
-    for (const auto &e : tbi.literals.right) {
-        const u32 id = e.first;
-        if (!tbi.hasFinalId(id)) {
-            continue;
-        }
-
-        if (e.second.delay) {
-            continue; /* delay id's are virtual-ish */
-        }
-
-        if (e.second.table != table) {
-            continue; /* wrong table */
-        }
-
-        assert(id < tbi.literal_info.size());
-        const rose_literal_info &info = tbi.literal_info[id];
-        u32 final_id = info.final_id;
-        rose_group groups = info.group_mask;
-        /* Note: requires_benefits are handled in the literal entries */
-        const ue2_literal &lit = e.second.s;
-
-        DEBUG_PRINTF("lit='%s'\n", escapeString(lit).c_str());
-
-        vector<u8> msk = e.second.msk; // copy
-        vector<u8> cmp = e.second.cmp; // copy
-
-        if (msk.empty()) {
-            // Try and pick up an advisory mask.
-            if (!findHamsterMask(tbi, e.second, info, msk, cmp)) {
-                msk.clear(); cmp.clear();
-            } else {
-                DEBUG_PRINTF("picked up late mask %zu\n", msk.size());
-            }
-        }
-
-        bool noruns = isNoRunsLiteral(tbi, id, info);
-
-        if (info.requires_explode) {
-            DEBUG_PRINTF("exploding lit\n");
-            const vector<u8> empty_msk; // msk/cmp will be empty
-            case_iter cit = caseIterateBegin(lit);
-            case_iter cite = caseIterateEnd();
-            for (; cit != cite; ++cit) {
-                DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d msk=%s, "
-                             "cmp=%s (exploded)\n",
-                             final_id, escapeString(lit.get_string()).c_str(),
-                             0, noruns, dumpMask(msk).c_str(),
-                             dumpMask(cmp).c_str());
-                hl->push_back(hwlmLiteral(*cit, false, noruns, final_id, groups,
-                                          empty_msk, empty_msk));
-            }
-        } else {
-            const std::string &s = lit.get_string();
-            const bool nocase = lit.any_nocase();
-
-            DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d, msk=%s, "
-                         "cmp=%s\n",
-                         final_id, escapeString(s).c_str(), (int)nocase, noruns,
-                         dumpMask(msk).c_str(), dumpMask(cmp).c_str());
-
-            if (!maskIsConsistent(s, nocase, msk, cmp)) {
-                DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n");
-                continue;
-            }
-
-            hl->push_back(hwlmLiteral(lit.get_string(), lit.any_nocase(),
-                                      noruns, final_id, groups, msk, cmp));
-        }
-    }
-}
-
-static
-aligned_unique_ptr<HWLM> buildFloatingMatcher(const RoseBuildImpl &tbi,
-                                              size_t *fsize,
-                                              size_t *historyRequired,
-                                              size_t *streamStateRequired) {
-    *fsize = 0;
-
-    vector<hwlmLiteral> fl;
-    fl.reserve(tbi.literals.size());
-    fillHamsterLiteralList(tbi, ROSE_FLOATING, &fl);
-    if (fl.empty()) {
-        DEBUG_PRINTF("empty floating matcher\n");
-        return nullptr;
-    }
-
-    hwlmStreamingControl ctl;
-    hwlmStreamingControl *ctlp;
-    if (tbi.cc.streaming) {
-        ctl.history_max = tbi.cc.grey.maxHistoryAvailable;
-        ctl.history_min = MAX(*historyRequired,
-                              tbi.cc.grey.minHistoryAvailable);
-        DEBUG_PRINTF("streaming control, history max=%zu, min=%zu\n",
-                     ctl.history_max, ctl.history_min);
-        ctlp = &ctl;
-    } else {
-        ctlp = nullptr; // Null for non-streaming.
-    }
-
-    aligned_unique_ptr<HWLM> ftable =
-        hwlmBuild(fl, ctlp, false, tbi.cc, tbi.getInitialGroups());
-    if (!ftable) {
-        throw CompileError("Unable to generate bytecode.");
-    }
-
-    if (tbi.cc.streaming) {
-        DEBUG_PRINTF("literal_history_required=%zu\n",
-                ctl.literal_history_required);
-        DEBUG_PRINTF("literal_stream_state_required=%zu\n",
-                ctl.literal_stream_state_required);
-        assert(ctl.literal_history_required <= tbi.cc.grey.maxHistoryAvailable);
-        *historyRequired = max(*historyRequired,
-                ctl.literal_history_required);
-        *streamStateRequired = ctl.literal_stream_state_required;
-    }
-
-    *fsize = hwlmSize(ftable.get());
-    assert(*fsize);
-    DEBUG_PRINTF("built floating literal table size %zu bytes\n", *fsize);
-    return ftable;
-}
-
-namespace {
-struct LongerThanLimit {
-    explicit LongerThanLimit(size_t len) : max_len(len) {}
-    bool operator()(const hwlmLiteral &lit) const {
-        return lit.s.length() > max_len;
-    }
-private:
-    size_t max_len;
-};
-}
-
-static
-aligned_unique_ptr<HWLM> buildSmallBlockMatcher(const RoseBuildImpl &tbi,
-                                                size_t *sbsize) {
-    *sbsize = 0;
-
-    if (tbi.cc.streaming) {
-        DEBUG_PRINTF("streaming mode\n");
-        return nullptr;
-    }
-
-    u32 float_min = findMinWidth(tbi, ROSE_FLOATING);
-    if (float_min > ROSE_SMALL_BLOCK_LEN) {
-        DEBUG_PRINTF("floating table has large min width %u, fail\n", float_min);
-        return nullptr;
-    }
-
-    vector<hwlmLiteral> lits;
-    fillHamsterLiteralList(tbi, ROSE_FLOATING, &lits);
-    if (lits.empty()) {
-        DEBUG_PRINTF("no floating table\n");
-        return nullptr;
-    } else if (lits.size() == 1) {
-        DEBUG_PRINTF("single floating literal, noodle will be fast enough\n");
-        return nullptr;
-    }
-
-    vector<hwlmLiteral> anchored_lits;
-    fillHamsterLiteralList(tbi, ROSE_ANCHORED_SMALL_BLOCK, &anchored_lits);
-    if (anchored_lits.empty()) {
-        DEBUG_PRINTF("no small-block anchored literals\n");
-        return nullptr;
-    }
-
-    lits.insert(lits.end(), anchored_lits.begin(), anchored_lits.end());
-
-    // Remove literals that are longer than our small block length, as they can
-    // never match. TODO: improve by removing literals that have a min match
-    // offset greater than ROSE_SMALL_BLOCK_LEN, which will catch anchored cases
-    // with preceding dots that put them over the limit.
-    lits.erase(std::remove_if(lits.begin(), lits.end(),
-                              LongerThanLimit(ROSE_SMALL_BLOCK_LEN)),
-               lits.end());
-
-    if (lits.empty()) {
-        DEBUG_PRINTF("no literals shorter than small block len\n");
-        return nullptr;
-    }
-
-    aligned_unique_ptr<HWLM> hwlm =
-        hwlmBuild(lits, nullptr, true, tbi.cc, tbi.getInitialGroups());
-    if (!hwlm) {
-        throw CompileError("Unable to generate bytecode.");
-    }
-
-    *sbsize = hwlmSize(hwlm.get());
-    assert(*sbsize);
-    DEBUG_PRINTF("built small block literal table size %zu bytes\n", *sbsize);
-    return hwlm;
-}
-
-static
-aligned_unique_ptr<HWLM> buildEodAnchoredMatcher(const RoseBuildImpl &tbi,
-                                                 size_t *esize) {
-    *esize = 0;
-
-    vector<hwlmLiteral> el;
-    fillHamsterLiteralList(tbi, ROSE_EOD_ANCHORED, &el);
-
-    if (el.empty()) {
-        DEBUG_PRINTF("no eod anchored literals\n");
-        assert(!tbi.ematcher_region_size);
-        return nullptr;
-    }
-
-    assert(tbi.ematcher_region_size);
-
-    hwlmStreamingControl *ctlp = nullptr; // not a streaming case
-    aligned_unique_ptr<HWLM> etable =
-        hwlmBuild(el, ctlp, true, tbi.cc, tbi.getInitialGroups());
-    if (!etable) {
-        throw CompileError("Unable to generate bytecode.");
-    }
-
-    *esize = hwlmSize(etable.get());
-    assert(*esize);
-    DEBUG_PRINTF("built eod-anchored literal table size %zu bytes\n", *esize);
-    return etable;
-}
-
 // Adds a sparse iterator to the end of the iterator table, returning its
 // offset.
 static
diff --git a/src/rose/rose_build_dump.cpp b/src/rose/rose_build_dump.cpp
index cd32749ebeb02b42962003f7b871b1909909467f..e56f322b391cfb59c7c29eaf49f317e44ead8261 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -32,6 +32,7 @@
 
 #include "hwlm/hwlm_build.h"
 #include "rose_build_impl.h"
+#include "rose_build_matchers.h"
 #include "rose/rose_dump.h"
 #include "rose_internal.h"
 #include "ue2common.h"
@@ -458,23 +459,18 @@ struct LongerThanLimit {
 
 static
 void dumpRoseTestLiterals(const RoseBuildImpl &build, const string &base) {
-
-    vector<hwlmLiteral> lits;
-
-    fillHamsterLiteralList(build, ROSE_ANCHORED, &lits);
+    auto lits = fillHamsterLiteralList(build, ROSE_ANCHORED);
     dumpTestLiterals(base + "rose_anchored_test_literals.txt", lits);
 
-    lits.clear();
-    fillHamsterLiteralList(build, ROSE_FLOATING, &lits);
+    lits = fillHamsterLiteralList(build, ROSE_FLOATING);
     dumpTestLiterals(base + "rose_float_test_literals.txt", lits);
 
-    lits.clear();
-    fillHamsterLiteralList(build, ROSE_EOD_ANCHORED, &lits);
+    lits = fillHamsterLiteralList(build, ROSE_EOD_ANCHORED);
     dumpTestLiterals(base + "rose_eod_test_literals.txt", lits);
 
-    lits.clear();
-    fillHamsterLiteralList(build, ROSE_FLOATING, &lits);
-    fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK, &lits);
+    lits = fillHamsterLiteralList(build, ROSE_FLOATING);
+    auto lits2 = fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK);
+    lits.insert(end(lits), begin(lits2), end(lits2));
     lits.erase(remove_if(lits.begin(), lits.end(),
                          LongerThanLimit(ROSE_SMALL_BLOCK_LEN)),
                lits.end());
diff --git a/src/rose/rose_build_impl.h b/src/rose/rose_build_impl.h
index 592476459da66187be21fb2188d59bd786d855ea..7b59c59e96be7ae425c1a5f43d2d5b8398b25bd3 100644 (file)
@@ -58,7 +58,6 @@ namespace ue2 {
 struct BoundaryReports;
 struct CastleProto;
 struct CompileContext;
-struct hwlmLiteral;
 class ReportManager;
 class SomSlotManager;
 
@@ -550,9 +549,6 @@ u64a findMaxOffset(const std::set<ReportID> &reports, const ReportManager &rm);
 void normaliseLiteralMask(const ue2_literal &s, std::vector<u8> &msk,
                           std::vector<u8> &cmp);
 
-void fillHamsterLiteralList(const RoseBuildImpl &tbi, rose_literal_table table,
-                            std::vector<hwlmLiteral> *hl);
-
 #ifndef NDEBUG
 bool canImplementGraphs(const RoseBuildImpl &tbi);
 #endif
diff --git a/src/rose/rose_build_matchers.cpp b/src/rose/rose_build_matchers.cpp
new file mode 100644 (file)
index 0000000..83c4955
--- /dev/null
@@ -0,0 +1,662 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file
+ * \brief Rose build: code for constructing literal tables.
+ */
+
+#include "rose_build_matchers.h"
+
+#include "rose_build_impl.h"
+#include "rose_build_width.h"
+#include "hwlm/hwlm_build.h"
+#include "hwlm/hwlm_literal.h"
+#include "nfa/castlecompile.h"
+#include "util/charreach_util.h"
+#include "util/compile_context.h"
+#include "util/compile_error.h"
+#include "util/dump_charclass.h"
+#include "util/report.h"
+#include "util/report_manager.h"
+#include "ue2common.h"
+
+#include <iomanip>
+#include <sstream>
+
+#include <boost/range/adaptor/map.hpp>
+
+using namespace std;
+using boost::adaptors::map_values;
+
+namespace ue2 {
+
+#ifdef DEBUG
+static UNUSED
+string dumpMask(const vector<u8> &v) {
+    ostringstream oss;
+    for (u8 e : v) {
+        oss << setfill('0') << setw(2) << hex << (unsigned int)e;
+    }
+    return oss.str();
+}
+#endif
+
+static
+bool maskFromLeftGraph(const LeftEngInfo &left, vector<u8> &msk,
+                       vector<u8> &cmp) {
+    const u32 lag = left.lag;
+    const ReportID report = left.leftfix_report;
+
+    DEBUG_PRINTF("leftfix with lag %u, report %u\n", lag, report);
+
+    assert(left.graph);
+    const NGHolder &h = *left.graph;
+    assert(in_degree(h.acceptEod, h) == 1); // no eod reports
+
+    // Start with the set of reporter vertices for this leftfix.
+    set<NFAVertex> curr;
+    for (auto u : inv_adjacent_vertices_range(h.accept, h)) {
+        if (contains(h[u].reports, report)) {
+            curr.insert(u);
+        }
+    }
+    assert(!curr.empty());
+
+    size_t i = HWLM_MASKLEN - lag - 1;
+    do {
+        if (curr.empty() || contains(curr, h.start)
+            || contains(curr, h.startDs)) {
+            DEBUG_PRINTF("end of the road\n");
+            break;
+        }
+
+        set<NFAVertex> next;
+        CharReach cr;
+        for (NFAVertex v : curr) {
+            const auto &v_cr = h[v].char_reach;
+            DEBUG_PRINTF("vertex %u, reach %s\n", h[v].index,
+                         describeClass(v_cr).c_str());
+            cr |= v_cr;
+            insert(&next, inv_adjacent_vertices(v, h));
+        }
+        make_and_cmp_mask(cr, &msk.at(i), &cmp.at(i));
+        DEBUG_PRINTF("%zu: reach=%s, msk=%u, cmp=%u\n", i,
+                     describeClass(cr).c_str(), msk[i], cmp[i]);
+        curr.swap(next);
+    } while (i-- > 0);
+
+    return true;
+}
+
+static
+bool maskFromLeftCastle(const LeftEngInfo &left, vector<u8> &msk,
+                        vector<u8> &cmp) {
+    const u32 lag = left.lag;
+    const ReportID report = left.leftfix_report;
+
+    DEBUG_PRINTF("leftfix with lag %u, report %u\n", lag, report);
+
+    assert(left.castle);
+    const CastleProto &c = *left.castle;
+
+    depth min_width(depth::infinity());
+    for (const PureRepeat &repeat : c.repeats | map_values) {
+        if (contains(repeat.reports, report)) {
+            min_width = min(min_width, repeat.bounds.min);
+        }
+    }
+
+    DEBUG_PRINTF("castle min width for this report is %s\n",
+                 min_width.str().c_str());
+
+    if (!min_width.is_finite() || min_width == depth(0)) {
+        DEBUG_PRINTF("bad min width\n");
+        return false;
+    }
+
+    u32 len = min_width;
+    u32 end = HWLM_MASKLEN - lag;
+    for (u32 i = end; i > end - min(end, len); i--) {
+        make_and_cmp_mask(c.reach(), &msk.at(i - 1), &cmp.at(i - 1));
+    }
+
+    return true;
+}
+
+static
+bool maskFromLeft(const LeftEngInfo &left, vector<u8> &msk, vector<u8> &cmp) {
+    if (left.lag >= HWLM_MASKLEN) {
+        DEBUG_PRINTF("too much lag\n");
+        return false;
+    }
+
+    if (left.graph) {
+        return maskFromLeftGraph(left, msk, cmp);
+    } else if (left.castle) {
+        return maskFromLeftCastle(left, msk, cmp);
+    }
+
+    return false;
+}
+
+static
+bool maskFromPreds(const RoseBuildImpl &build, const rose_literal_id &id,
+                   const RoseVertex v, vector<u8> &msk, vector<u8> &cmp) {
+    const RoseGraph &g = build.g;
+
+    // For right now, wuss out and only handle cases with one pred.
+    if (in_degree(v, g) != 1) {
+        return false;
+    }
+
+    // Root successors have no literal before them.
+    if (build.isRootSuccessor(v)) {
+        return false;
+    }
+
+    // If we have a single predecessor with a short bound, we may be able to
+    // fill out a mask with the trailing bytes of the previous literal. This
+    // allows us to improve literals like the 'bar' in 'fo.bar'.
+
+    RoseEdge e = *(in_edges(v, g).first);
+    u32 bound = g[e].maxBound;
+    if (bound != g[e].minBound || bound >= HWLM_MASKLEN) {
+        return false;
+    }
+
+    bound += id.s.length();
+    if (bound >= HWLM_MASKLEN) {
+        return false;
+    }
+
+    DEBUG_PRINTF("bound %u\n", bound);
+
+    RoseVertex u = source(e, g);
+    if (g[u].literals.size() != 1) {
+        DEBUG_PRINTF("u has %zu literals\n", g[u].literals.size());
+        return false;
+    }
+
+    u32 u_lit_id = *(g[u].literals.begin());
+    const rose_literal_id &u_id = build.literals.right.at(u_lit_id);
+    DEBUG_PRINTF("u has lit: %s\n", escapeString(u_id.s).c_str());
+
+    // Number of characters to take from the back of u's literal.
+    size_t u_len = u_id.s.length();
+    size_t u_sublen = min(u_len, (size_t)HWLM_MASKLEN - bound);
+
+    size_t i = HWLM_MASKLEN - (bound + u_sublen);
+
+    ue2_literal::const_iterator it, ite;
+    for (it = u_id.s.begin() + (u_len - u_sublen), ite = u_id.s.end();
+            it != ite; ++it) {
+        make_and_cmp_mask(*it, &msk.at(i), &cmp.at(i));
+        ++i;
+    }
+
+    return true;
+}
+
+static
+bool findHamsterMask(const RoseBuildImpl &build, const rose_literal_id &id,
+                     const rose_literal_info &info, const RoseVertex v,
+                     vector<u8> &msk, vector<u8> &cmp) {
+    // Start with zero masks.
+    msk.assign(HWLM_MASKLEN, 0);
+    cmp.assign(HWLM_MASKLEN, 0);
+
+    // Masks can come from literal benefits (for mixed-case literals).
+    if (info.requires_benefits) {
+        assert(mixed_sensitivity(id.s));
+
+        size_t j = 0;
+        for (ue2_literal::const_reverse_iterator it = id.s.rbegin(),
+                                                 ite = id.s.rend();
+             it != ite && j < HWLM_MASKLEN; ++it, ++j) {
+            size_t offset = HWLM_MASKLEN - j - 1;
+            const CharReach &cr = *it;
+            make_and_cmp_mask(cr, &msk[offset], &cmp[offset]);
+        }
+        return true;
+    }
+
+    const LeftEngInfo &left = build.g[v].left;
+    if (left && left.lag < HWLM_MASKLEN) {
+        if (maskFromLeft(left, msk, cmp)) {
+            DEBUG_PRINTF("mask from a leftfix!\n");
+            return true;
+        }
+    }
+
+    if (id.s.length() < HWLM_MASKLEN) {
+        if (maskFromPreds(build, id, v, msk, cmp)) {
+            DEBUG_PRINTF("mask from preds!\n");
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static
+bool hamsterMaskCombine(vector<u8> &msk, vector<u8> &cmp,
+                        const vector<u8> &v_msk, const vector<u8> &v_cmp) {
+    assert(msk.size() == HWLM_MASKLEN && cmp.size() == HWLM_MASKLEN);
+    assert(v_msk.size() == HWLM_MASKLEN && v_cmp.size() == HWLM_MASKLEN);
+
+    u8 all_masks = 0;
+
+    for (size_t i = 0; i < HWLM_MASKLEN; i++) {
+        u8 filter = ~(cmp[i] ^ v_cmp[i]);
+        msk[i] &= v_msk[i];
+        msk[i] &= filter;
+        cmp[i] &= filter;
+
+        all_masks |= msk[i];
+    }
+
+    // Return false if we have no bits on in any mask elements.
+    return all_masks != 0;
+}
+
+static
+bool findHamsterMask(const RoseBuildImpl &build, const rose_literal_id &id,
+                     const rose_literal_info &info,
+                     vector<u8> &msk, vector<u8> &cmp) {
+    if (!build.cc.grey.roseHamsterMasks) {
+        return false;
+    }
+
+    if (!info.delayed_ids.empty()) {
+        // Not safe to add masks to delayed literals at this late stage.
+        return false;
+    }
+
+    size_t num = 0;
+    vector<u8> v_msk, v_cmp;
+
+    for (RoseVertex v : info.vertices) {
+        if (!findHamsterMask(build, id, info, v, v_msk, v_cmp)) {
+            DEBUG_PRINTF("no mask\n");
+            return false;
+        }
+
+        if (!num++) {
+            // First (or only) vertex, this becomes the mask/cmp pair.
+            msk = v_msk;
+            cmp = v_cmp;
+        } else {
+            // Multiple vertices with potentially different masks. We combine
+            // them into an 'advisory' mask.
+            if (!hamsterMaskCombine(msk, cmp, v_msk, v_cmp)) {
+                DEBUG_PRINTF("mask went to zero\n");
+                return false;
+            }
+        }
+    }
+
+    normaliseLiteralMask(id.s, msk, cmp);
+
+    if (msk.empty()) {
+        DEBUG_PRINTF("no mask\n");
+        return false;
+    }
+
+    DEBUG_PRINTF("msk=%s, cmp=%s\n", dumpMask(msk).c_str(),
+                 dumpMask(cmp).c_str());
+    return true;
+}
+
+static
+bool isDirectHighlander(const RoseBuildImpl &build, const u32 id,
+                        const rose_literal_info &info) {
+    if (!build.isDirectReport(id)) {
+        return false;
+    }
+
+    auto is_simple_exhaustible = [&build](ReportID id) {
+        const Report &report = build.rm.getReport(id);
+        return isSimpleExhaustible(report);
+    };
+
+    assert(!info.vertices.empty());
+    for (const auto &v : info.vertices) {
+        const auto &reports = build.g[v].reports;
+        assert(!reports.empty());
+        if (!all_of(begin(reports), end(reports),
+                    is_simple_exhaustible)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Called by isNoRunsLiteral below.
+static
+bool isNoRunsVertex(const RoseBuildImpl &build, NFAVertex u) {
+    const RoseGraph &g = build.g;
+    if (!g[u].isBoring()) {
+        DEBUG_PRINTF("u=%zu is not boring\n", g[u].idx);
+        return false;
+    }
+
+    if (!g[u].reports.empty()) {
+        DEBUG_PRINTF("u=%zu has accept\n", g[u].idx);
+        return false;
+    }
+
+    /* TODO: handle non-root roles as well. It can't be that difficult... */
+
+    if (!in_degree_equal_to(u, g, 1)) {
+        DEBUG_PRINTF("u=%zu is not a root role\n", g[u].idx);
+        return false;
+    }
+
+    RoseEdge e;
+    bool exists;
+    tie(e, exists) = edge_by_target(build.root, u, g);
+
+    if (!exists) {
+        DEBUG_PRINTF("u=%zu is not a root role\n", g[u].idx);
+        return false;
+    }
+
+    if (g[e].minBound != 0 || g[e].maxBound != ROSE_BOUND_INF) {
+        DEBUG_PRINTF("u=%zu has bounds from root\n", g[u].idx);
+        return false;
+    }
+
+    for (const auto &oe : out_edges_range(u, g)) {
+        RoseVertex v = target(oe, g);
+        if (g[oe].maxBound != ROSE_BOUND_INF) {
+            DEBUG_PRINTF("edge (%zu,%zu) has max bound\n", g[u].idx,
+                    g[target(oe, g)].idx);
+            return false;
+        }
+        if (g[v].left) {
+            DEBUG_PRINTF("v=%zu has rose prefix\n", g[v].idx);
+            return false;
+        }
+    }
+    return true;
+}
+
+static
+bool isNoRunsLiteral(const RoseBuildImpl &build, const u32 id,
+                     const rose_literal_info &info) {
+    DEBUG_PRINTF("lit id %u\n", id);
+
+    if (info.requires_benefits) {
+        DEBUG_PRINTF("requires benefits\n"); // which would need confirm
+        return false;
+    }
+
+    if (isDirectHighlander(build, id, info)) {
+        DEBUG_PRINTF("highlander direct report\n");
+        return true;
+    }
+
+    // Undelayed vertices.
+    for (RoseVertex v : info.vertices) {
+        if (!isNoRunsVertex(build, v)) {
+            return false;
+        }
+    }
+
+    // Delayed vertices.
+    for (u32 d : info.delayed_ids) {
+        assert(d < build.literal_info.size());
+        const rose_literal_info &delayed_info = build.literal_info.at(d);
+        assert(delayed_info.undelayed_id == id);
+        for (RoseVertex v : delayed_info.vertices) {
+            if (!isNoRunsVertex(build, v)) {
+                return false;
+            }
+        }
+    }
+
+    DEBUG_PRINTF("is no-runs literal\n");
+    return true;
+}
+
+vector<hwlmLiteral> fillHamsterLiteralList(const RoseBuildImpl &build,
+                                           rose_literal_table table) {
+    vector<hwlmLiteral> lits;
+
+    for (const auto &e : build.literals.right) {
+        const u32 id = e.first;
+        if (!build.hasFinalId(id)) {
+            continue;
+        }
+
+        if (e.second.delay) {
+            continue; /* delay id's are virtual-ish */
+        }
+
+        if (e.second.table != table) {
+            continue; /* wrong table */
+        }
+
+        assert(id < build.literal_info.size());
+        const rose_literal_info &info = build.literal_info[id];
+        u32 final_id = info.final_id;
+        rose_group groups = info.group_mask;
+        /* Note: requires_benefits are handled in the literal entries */
+        const ue2_literal &lit = e.second.s;
+
+        DEBUG_PRINTF("lit='%s'\n", escapeString(lit).c_str());
+
+        vector<u8> msk = e.second.msk; // copy
+        vector<u8> cmp = e.second.cmp; // copy
+
+        if (msk.empty()) {
+            // Try and pick up an advisory mask.
+            if (!findHamsterMask(build, e.second, info, msk, cmp)) {
+                msk.clear(); cmp.clear();
+            } else {
+                DEBUG_PRINTF("picked up late mask %zu\n", msk.size());
+            }
+        }
+
+        bool noruns = isNoRunsLiteral(build, id, info);
+
+        if (info.requires_explode) {
+            DEBUG_PRINTF("exploding lit\n");
+            const vector<u8> empty_msk; // msk/cmp will be empty
+            case_iter cit = caseIterateBegin(lit);
+            case_iter cite = caseIterateEnd();
+            for (; cit != cite; ++cit) {
+                DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d msk=%s, "
+                             "cmp=%s (exploded)\n",
+                             final_id, escapeString(lit.get_string()).c_str(),
+                             0, noruns, dumpMask(msk).c_str(),
+                             dumpMask(cmp).c_str());
+                lits.emplace_back(*cit, false, noruns, final_id, groups,
+                                  empty_msk, empty_msk);
+            }
+        } else {
+            const std::string &s = lit.get_string();
+            const bool nocase = lit.any_nocase();
+
+            DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d, msk=%s, "
+                         "cmp=%s\n",
+                         final_id, escapeString(s).c_str(), (int)nocase, noruns,
+                         dumpMask(msk).c_str(), dumpMask(cmp).c_str());
+
+            if (!maskIsConsistent(s, nocase, msk, cmp)) {
+                DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n");
+                continue;
+            }
+
+            lits.emplace_back(lit.get_string(), lit.any_nocase(), noruns,
+                              final_id, groups, msk, cmp);
+        }
+    }
+
+    return lits;
+}
+
+aligned_unique_ptr<HWLM> buildFloatingMatcher(const RoseBuildImpl &build,
+                                              size_t *fsize,
+                                              size_t *historyRequired,
+                                              size_t *streamStateRequired) {
+    *fsize = 0;
+
+    auto fl = fillHamsterLiteralList(build, ROSE_FLOATING);
+    if (fl.empty()) {
+        DEBUG_PRINTF("empty floating matcher\n");
+        return nullptr;
+    }
+
+    hwlmStreamingControl ctl;
+    hwlmStreamingControl *ctlp;
+    if (build.cc.streaming) {
+        ctl.history_max = build.cc.grey.maxHistoryAvailable;
+        ctl.history_min = MAX(*historyRequired,
+                              build.cc.grey.minHistoryAvailable);
+        DEBUG_PRINTF("streaming control, history max=%zu, min=%zu\n",
+                     ctl.history_max, ctl.history_min);
+        ctlp = &ctl;
+    } else {
+        ctlp = nullptr; // Null for non-streaming.
+    }
+
+    aligned_unique_ptr<HWLM> ftable =
+        hwlmBuild(fl, ctlp, false, build.cc, build.getInitialGroups());
+    if (!ftable) {
+        throw CompileError("Unable to generate bytecode.");
+    }
+
+    if (build.cc.streaming) {
+        DEBUG_PRINTF("literal_history_required=%zu\n",
+                ctl.literal_history_required);
+        DEBUG_PRINTF("literal_stream_state_required=%zu\n",
+                ctl.literal_stream_state_required);
+        assert(ctl.literal_history_required <=
+               build.cc.grey.maxHistoryAvailable);
+        *historyRequired = max(*historyRequired,
+                ctl.literal_history_required);
+        *streamStateRequired = ctl.literal_stream_state_required;
+    }
+
+    *fsize = hwlmSize(ftable.get());
+    assert(*fsize);
+    DEBUG_PRINTF("built floating literal table size %zu bytes\n", *fsize);
+    return ftable;
+}
+
+aligned_unique_ptr<HWLM> buildSmallBlockMatcher(const RoseBuildImpl &build,
+                                                size_t *sbsize) {
+    *sbsize = 0;
+
+    if (build.cc.streaming) {
+        DEBUG_PRINTF("streaming mode\n");
+        return nullptr;
+    }
+
+    u32 float_min = findMinWidth(build, ROSE_FLOATING);
+    if (float_min > ROSE_SMALL_BLOCK_LEN) {
+        DEBUG_PRINTF("floating table has large min width %u, fail\n",
+                     float_min);
+        return nullptr;
+    }
+
+    auto lits = fillHamsterLiteralList(build, ROSE_FLOATING);
+    if (lits.empty()) {
+        DEBUG_PRINTF("no floating table\n");
+        return nullptr;
+    } else if (lits.size() == 1) {
+        DEBUG_PRINTF("single floating literal, noodle will be fast enough\n");
+        return nullptr;
+    }
+
+    auto anchored_lits =
+        fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK);
+    if (anchored_lits.empty()) {
+        DEBUG_PRINTF("no small-block anchored literals\n");
+        return nullptr;
+    }
+
+    lits.insert(lits.end(), anchored_lits.begin(), anchored_lits.end());
+
+    // Remove literals that are longer than our small block length, as they can
+    // never match. TODO: improve by removing literals that have a min match
+    // offset greater than ROSE_SMALL_BLOCK_LEN, which will catch anchored cases
+    // with preceding dots that put them over the limit.
+    auto longer_than_limit = [](const hwlmLiteral &lit) {
+        return lit.s.length() > ROSE_SMALL_BLOCK_LEN;
+    };
+    lits.erase(remove_if(lits.begin(), lits.end(), longer_than_limit),
+               lits.end());
+
+    if (lits.empty()) {
+        DEBUG_PRINTF("no literals shorter than small block len\n");
+        return nullptr;
+    }
+
+    aligned_unique_ptr<HWLM> hwlm =
+        hwlmBuild(lits, nullptr, true, build.cc, build.getInitialGroups());
+    if (!hwlm) {
+        throw CompileError("Unable to generate bytecode.");
+    }
+
+    *sbsize = hwlmSize(hwlm.get());
+    assert(*sbsize);
+    DEBUG_PRINTF("built small block literal table size %zu bytes\n", *sbsize);
+    return hwlm;
+}
+
+aligned_unique_ptr<HWLM> buildEodAnchoredMatcher(const RoseBuildImpl &build,
+                                                 size_t *esize) {
+    *esize = 0;
+
+    auto el = fillHamsterLiteralList(build, ROSE_EOD_ANCHORED);
+
+    if (el.empty()) {
+        DEBUG_PRINTF("no eod anchored literals\n");
+        assert(!build.ematcher_region_size);
+        return nullptr;
+    }
+
+    assert(build.ematcher_region_size);
+
+    hwlmStreamingControl *ctlp = nullptr; // not a streaming case
+    aligned_unique_ptr<HWLM> etable =
+        hwlmBuild(el, ctlp, true, build.cc, build.getInitialGroups());
+    if (!etable) {
+        throw CompileError("Unable to generate bytecode.");
+    }
+
+    *esize = hwlmSize(etable.get());
+    assert(*esize);
+    DEBUG_PRINTF("built eod-anchored literal table size %zu bytes\n", *esize);
+    return etable;
+}
+
+} // namespace ue2
diff --git a/src/rose/rose_build_matchers.h b/src/rose/rose_build_matchers.h
new file mode 100644 (file)
index 0000000..9781f51
--- /dev/null
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file
+ * \brief Rose build: code for constructing literal tables.
+ */
+
+#ifndef ROSE_BUILD_MATCHERS_H
+#define ROSE_BUILD_MATCHERS_H
+
+#include "rose_build_impl.h"
+
+#include <vector>
+
+struct HWLM;
+
+namespace ue2 {
+
+struct hwlmLiteral;
+
+/**
+ * \brief Build the list of HWLM literals for the given Rose literal table.
+ *
+ * Only literals that have a final ID, are not delayed variants and belong to
+ * \p table are included.
+ */
+std::vector<hwlmLiteral> fillHamsterLiteralList(const RoseBuildImpl &build,
+                                                rose_literal_table table);
+
+/**
+ * \brief Build the literal matcher for the floating (ROSE_FLOATING) table.
+ *
+ * \param build Rose build state.
+ * \param[out] fsize Size in bytes of the built table; zero if none is built.
+ * \param[in,out] historyRequired In streaming mode, raised to the literal
+ *        matcher's history requirement if that is larger.
+ * \param[out] streamStateRequired In streaming mode, set to the literal
+ *        matcher's stream state requirement; otherwise left untouched.
+ * \return The built table, or nullptr if there are no floating literals.
+ *         Throws CompileError if the table cannot be built.
+ */
+aligned_unique_ptr<HWLM> buildFloatingMatcher(const RoseBuildImpl &build,
+                                              size_t *fsize,
+                                              size_t *historyRequired,
+                                              size_t *streamStateRequired);
+
+/**
+ * \brief Build the literal matcher used for small blocks, combining floating
+ * and ROSE_ANCHORED_SMALL_BLOCK literals.
+ *
+ * \param build Rose build state.
+ * \param[out] sbsize Size in bytes of the built table; zero if none is built.
+ * \return The built table, or nullptr if no small-block matcher is built
+ *         (for example, in streaming mode). Throws CompileError if the table
+ *         cannot be built.
+ */
+aligned_unique_ptr<HWLM> buildSmallBlockMatcher(const RoseBuildImpl &build,
+                                                size_t *sbsize);
+
+/**
+ * \brief Build the literal matcher for the ROSE_EOD_ANCHORED table.
+ *
+ * \param build Rose build state.
+ * \param[out] esize Size in bytes of the built table; zero if none is built.
+ * \return The built table, or nullptr if there are no EOD-anchored literals.
+ *         Throws CompileError if the table cannot be built.
+ */
+aligned_unique_ptr<HWLM> buildEodAnchoredMatcher(const RoseBuildImpl &build,
+                                                 size_t *esize);
+
+} // namespace ue2
+
+#endif // ROSE_BUILD_MATCHERS_H