* ``min_length``: The minimum match length (from start to end) required to
successfully match this expression.
* ``edit_distance``: Match this expression within a given Levenshtein distance.
+* ``hamming_distance``: Match this expression within a given Hamming distance.
These parameters either allow the set of matches produced by a pattern to be
constrained at compile time (rather than relying on the application to process
streams ``foo0123bar`` or ``foo0123456bar``.
Similarly, the pattern :regexp:`/foobar/` when given an ``edit_distance`` of 2
-will produce matches when scanned against ``foobar``, ``fooba``, ``fobr``,
-``fo_baz``, ``foooobar``, and anything else that lies within edit distance of 2
-(as defined by Levenshtein distance). For more details, see the
-:ref:`approximate_matching` section.
+will produce matches when scanned against ``foobar``, ``f00bar``, ``fooba``,
+``fobr``, ``fo_baz``, ``foooobar``, and anything else that lies within edit
+distance of 2 (as defined by Levenshtein distance).
+
+When the same pattern :regexp:`/foobar/` is given a ``hamming_distance`` of 2,
+it will produce matches when scanned against ``foobar``, ``boofar``,
+``f00bar``, and anything else with at most two characters substituted from the
+original pattern. For more details, see the :ref:`approximate_matching`
+section.
=================
Prefiltering Mode
#. **Edit distance** is defined as Levenshtein distance. That is, there are
three possible edit types considered: insertion, removal and substitution.
- More formal description can be found on
+ A more formal description can be found on
`Wikipedia <https://en.wikipedia.org/wiki/Levenshtein_distance>`_.
-#. **Approximate matching** will match all *corpora* within a given edit
- distance. That is, given a pattern, approximate matching will match anything
- that can be edited to arrive at a corpus that exactly matches the original
- pattern.
+#. **Hamming distance** is the number of positions at which two strings of
+ equal length differ. That is, it is the number of substitutions required to
+ convert one string to the other. There are no insertions or removals when
+ approximate matching using a Hamming distance. A more formal description can
+ be found on
+ `Wikipedia <https://en.wikipedia.org/wiki/Hamming_distance>`_.
+
+#. **Approximate matching** will match all *corpora* within a given edit or
+ Hamming distance. That is, given a pattern, approximate matching will match
+ anything that can be edited to arrive at a corpus that exactly matches the
+ original pattern.
#. **Matching semantics** are exactly the same as described in :ref:`semantics`.
reduce to so-called "vacuous" patterns (patterns that match everything). For
example, pattern :regexp:`/foo/` with edit distance 3, if implemented,
would reduce to matching zero-length buffers. Such patterns will result in a
- "Pattern cannot be approximately matched" compile error.
+ "Pattern cannot be approximately matched" compile error. Approximate
+ matching within a Hamming distance does not remove symbols, so it will not
+ reduce to a vacuous pattern.
* Finally, due to the inherent complexities of defining matching behavior,
approximate matching implements a reduced subset of regular expression
syntax. Approximate matching does not support UTF-8 (and other
struct ShadowGraph {
NGHolder &g;
u32 edit_distance;
+ bool hamming;
map<pair<NFAVertex, u32>, NFAVertex> shadow_map;
map<pair<NFAVertex, u32>, NFAVertex> helper_map;
map<NFAVertex, NFAVertex> clones;
vector<pair<NFAVertex, NFAVertex>> edges_to_be_added;
flat_set<NFAVertex> orig;
- ShadowGraph(NGHolder &g_in, u32 ed_in) : g(g_in), edit_distance(ed_in) {}
+ ShadowGraph(NGHolder &g_in, u32 ed_in, bool hamm_in)
+ : g(g_in), edit_distance(ed_in), hamming(hamm_in) {}
void fuzz_graph() {
if (edit_distance == 0) {
return;
}
+ DEBUG_PRINTF("edit distance = %u hamming = %s\n", edit_distance,
+ hamming ? "true" : "false");
+
// step 1: prepare the vertices, helpers and shadows according to
// the original graph
prepare_graph();
// step 3: set up reports for newly created vertices (and make clones
// if necessary)
- create_reports();
+ if (!hamming) {
+ create_reports();
+ }
// step 4: wire up shadow graph and helpers for insert/replace/remove
connect_shadow_graph();
// if there's nowhere to go from this vertex, no helper needed
if (proper_out_degree(v, g) < 1) {
+ DEBUG_PRINTF("No helper for node ID: %zu (level %u)\n",
+ g[shadow_v].index, dist);
+ helper_map[make_pair(v, dist)] = shadow_v;
+ continue;
+ }
+
+ // start and startDs only have helpers for insert, so not Hamming
+ if (hamming && is_any_start(v, g)) {
+ DEBUG_PRINTF("No helper for node ID: %zu (level %u)\n",
+ g[shadow_v].index, dist);
helper_map[make_pair(v, dist)] = shadow_v;
continue;
}
g[helper_v].char_reach = CharReach::dot();
// do not copy virtual start's assert flags
if (is_virtual_start(v, g)) {
+ DEBUG_PRINTF("Helper node ID is virtual start: %zu (level %u)\n",
+ g[helper_v].index, dist);
g[helper_v].assert_flags = 0;
}
helper_map[make_pair(v, dist)] = helper_v;
const auto &cur_shadow_helper = helper_map[make_pair(v, dist)];
// multiple insert
- if (dist > 1) {
+ if (!hamming && dist > 1) {
const auto &prev_level_helper = helper_map[make_pair(v, dist - 1)];
connect_to_clones(prev_level_helper, cur_shadow_helper);
}
connect_preds(v, dist);
// handle helpers
- if (dist > 0) {
+ if (!hamming && dist > 0) {
connect_helpers(v, dist);
}
}
// handle removals
- connect_removals(v);
+ if (!hamming) {
+ connect_removals(v);
+ }
}
}
return false;
}
-void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool utf8,
- const Grey &grey) {
+void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool hamming,
+ bool utf8, const Grey &grey) {
if (edit_distance == 0) {
return;
}
"approximate matching.");
}
}
- if (will_turn_vacuous(g, edit_distance)) {
+ if (!hamming && will_turn_vacuous(g, edit_distance)) {
throw CompileError("Approximate matching patterns that reduce to "
"vacuous patterns are disallowed.");
}
}
-void make_fuzzy(NGHolder &g, u32 edit_distance, const Grey &grey) {
+void make_fuzzy(NGHolder &g, u32 edit_distance, bool hamming,
+ const Grey &grey) {
if (edit_distance == 0) {
return;
}
assert(grey.allowApproximateMatching);
assert(grey.maxEditDistance >= edit_distance);
- ShadowGraph sg(g, edit_distance);
+ ShadowGraph sg(g, edit_distance, hamming);
sg.fuzz_graph();
// For safety, enforce limit on actual vertex count.
os << "edit_distance=" << ext.edit_distance;
first = false;
}
+ if (ext.flags & HS_EXT_FLAG_HAMMING_DISTANCE) {
+ if (!first) {
+ os << ", ";
+ }
+ os << "hamming_distance=" << ext.hamming_distance;
+ first = false;
+ }
return os;
}
free(info);
}
-static const hs_expr_ext NO_EXT_PARAM = { 0, 0, 0, 0, 0 };
+static const hs_expr_ext NO_EXT_PARAM = { 0, 0, 0, 0, 0, 0 };
static const expected_info ei_test[] = {
{"abc", NO_EXT_PARAM, 3, 3, 0, 0, 0},
{"(foo|bar)\\z", NO_EXT_PARAM, 3, 3, 0, 1, 1},
// Some cases with extended parameters.
- {"^abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0}, 6, 10, 0, 0, 0},
- {"^abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 100, UINT_MAX, 0, 0, 0},
- {"abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0}, 6, 10, 0, 0, 0},
- {"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 100, UINT_MAX, 0, 0, 0},
- {"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 5, 0}, 6, UINT_MAX, 0, 0, 0},
-
- {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, UINT_MAX, 0, 0, 0},
- {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, UINT_MAX, 0, 0, 0},
- {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2},
+ {"^abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0, 0}, 6, 10, 0, 0, 0},
+ {"^abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0, 0}, 100, UINT_MAX, 0, 0, 0},
+ {"abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0, 0}, 6, 10, 0, 0, 0},
+ {"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0, 0}, 100, UINT_MAX, 0, 0, 0},
+ {"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 5, 0, 0}, 6, UINT_MAX, 0, 0, 0},
+
+ {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1, 0}, 5, UINT_MAX, 0, 0, 0},
+ {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2, 0}, 4, UINT_MAX, 0, 0, 0},
+ {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2, 0},
10, UINT_MAX, 0, 0, 0},
- {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2},
+ {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2, 0},
4, UINT_MAX, 0, 0, 0},
- {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2},
+ {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2, 0},
4, 6, 0, 0, 0},
- {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, UINT_MAX, 0, 0, 0},
- {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, UINT_MAX, 0, 0, 0},
- {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2},
+ {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1, 0}, 5, UINT_MAX, 0, 0, 0},
+ {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2, 0}, 4, UINT_MAX, 0, 0, 0},
+ {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2, 0},
10, UINT_MAX, 0, 0, 0},
- {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2},
+ {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2, 0},
4, UINT_MAX, 0, 0, 0},
- {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2},
+ {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2, 0},
4, 6, 0, 0, 0},
- {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, 7, 0, 0, 0},
- {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, 8, 0, 0, 0},
- {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 8, 2},
+ {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1, 0}, 5, 7, 0, 0, 0},
+ {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2, 0}, 4, 8, 0, 0, 0},
+ {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 8, 2, 0},
8, 8, 0, 0, 0},
- {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2},
+ {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2, 0},
4, 8, 0, 0, 0},
- {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2},
+ {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2, 0},
4, 6, 0, 0, 0},
+
+ {"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 1}, 6, UINT_MAX, 0, 0, 0},
+ {"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 2}, 6, UINT_MAX, 0, 0, 0},
+ {"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 5}, 6, UINT_MAX, 0, 0, 0},
+ {"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 0, 2},
+ 10, UINT_MAX, 0, 0, 0},
+ {"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 0, 2},
+ 6, UINT_MAX, 0, 0, 0},
+ {"abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 0, 2},
+ 6, 6, 0, 0, 0},
+
+ {"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 1}, 6, UINT_MAX, 0, 0, 0},
+ {"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 2}, 6, UINT_MAX, 0, 0, 0},
+ {"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 5}, 6, UINT_MAX, 0, 0, 0},
+ {"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 0, 2},
+ 10, UINT_MAX, 0, 0, 0},
+ {"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 0, 2},
+ 6, UINT_MAX, 0, 0, 0},
+ {"^abc.*def", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 0, 2},
+ 6, 6, 0, 0, 0},
+
+ {"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 1}, 6, 6, 0, 0, 0},
+ {"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 2}, 6, 6, 0, 0, 0},
+ {"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE, 0, 0, 0, 0, 5}, 6, 6, 0, 0, 0},
+ {"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 6, 0, 2},
+ 6, 6, 0, 0, 0},
+ {"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 0, 2},
+ 6, 6, 0, 0, 0},
+ {"^abcdef", {HS_EXT_FLAG_HAMMING_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 0, 2},
+ 6, 6, 0, 0, 0},
};
INSTANTIATE_TEST_CASE_P(ExprInfo, ExprInfop, ValuesIn(ei_test));