From: Vsevolod Stakhov Date: Tue, 25 Nov 2025 17:13:19 +0000 (+0000) Subject: [Feature] Add inverted index for composites optimization X-Git-Tag: 3.14.1~5^2~7 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=6c19888b9b4bbceb29eb0b244a33d8a095751c60;p=thirdparty%2Frspamd.git [Feature] Add inverted index for composites optimization Build an inverted index mapping symbol names to composites that contain those symbols as positive (non-negated) atoms. This allows filtering out composites that cannot possibly match during the first pass evaluation. - Add rspamd_expression_atom_foreach_ex() to traverse expression atoms with access to AST nodes (needed to detect negated atoms) - Add rspamd_expression_node_is_op() to check if a node is an operator - Build inverted index in composites_manager during config processing - Track composites with only negated atoms separately (they must always be evaluated) - Use inverted index in composites_metric_callback for first pass to evaluate only potentially matching composites For configurations with many composites (4000+), this reduces the number of composites evaluated per message from all of them to only those that have at least one of their positive (non-negated) symbols present, plus the composites with only negated atoms, which are always evaluated. 
--- diff --git a/src/libserver/composites/composites.cxx b/src/libserver/composites/composites.cxx index 6e7e435a4e..b9ab013da2 100644 --- a/src/libserver/composites/composites.cxx +++ b/src/libserver/composites/composites.cxx @@ -991,12 +991,47 @@ composites_metric_callback(struct rspamd_task *task) } } else { - /* First pass: use symcache iteration (will skip second-pass composites in callback) */ - msg_debug_composites("processing first-pass composites via symcache"); - rspamd_symcache_composites_foreach(task, - task->cfg->cache, - composites_foreach_callback, - &cd); + /* First pass: use inverted index for fast lookup */ + ankerl::unordered_dense::set potentially_active; + + /* Callback data for collecting potentially active composites */ + struct collect_active_cbdata { + composites_manager *cm; + ankerl::unordered_dense::set *active; + } collect_data{cm, &potentially_active}; + + /* Collect composites that have at least one positive atom present */ + rspamd_task_symbol_result_foreach(task, mres, [](gpointer key, gpointer value, gpointer ud) { + auto *cbd = reinterpret_cast(ud); + std::string_view sym_name{reinterpret_cast(key)}; + + auto it = cbd->cm->symbol_to_composites.find(sym_name); + if (it != cbd->cm->symbol_to_composites.end()) { + for (auto *comp: it->second) { + /* Only add first-pass composites */ + if (!comp->second_pass) { + cbd->active->insert(comp); + } + } + } }, &collect_data); + + /* Always add NOT-only composites (they have no positive atoms) */ + for (auto *comp: cm->not_only_composites) { + if (!comp->second_pass) { + potentially_active.insert(comp); + } + } + + msg_debug_composites("processing %d potentially active composites (from %d first-pass)", + (int) potentially_active.size(), + (int) cm->first_pass_composites.size()); + + /* Process only potentially active composites */ + for (auto *comp: potentially_active) { + composites_foreach_callback((gpointer) comp->sym.c_str(), + (gpointer) comp, + &cd); + } } } diff --git 
a/src/libserver/composites/composites_internal.hxx b/src/libserver/composites/composites_internal.hxx index 09ff48e8d0..032747e872 100644 --- a/src/libserver/composites/composites_internal.hxx +++ b/src/libserver/composites/composites_internal.hxx @@ -47,7 +47,8 @@ struct rspamd_composite { struct rspamd_expression *expr; int id; rspamd_composite_policy policy; - bool second_pass; /**< true if this composite needs second pass evaluation */ + bool second_pass; /**< true if this composite needs second pass evaluation */ + bool has_positive_atoms; /**< true if composite has at least one non-negated atom */ }; #define COMPOSITE_MANAGER_FROM_PTR(ptr) (reinterpret_cast(ptr)) @@ -114,8 +115,17 @@ public: std::vector first_pass_composites; /* Evaluated during COMPOSITES stage */ std::vector second_pass_composites; /* Evaluated during COMPOSITES_POST stage */ + /* Inverted index: symbol -> composites that contain this symbol as positive atom */ + ankerl::unordered_dense::map, + rspamd::smart_str_hash, rspamd::smart_str_equal> + symbol_to_composites; + /* Composites that have only negated atoms (must always be checked) */ + std::vector not_only_composites; + /* Analyze composite dependencies and split into first/second pass vectors */ void process_dependencies(); + /* Build inverted index for fast composite lookup */ + void build_inverted_index(); }; }// namespace rspamd::composites diff --git a/src/libserver/composites/composites_manager.cxx b/src/libserver/composites/composites_manager.cxx index c1bc1d94a2..57ae175cfb 100644 --- a/src/libserver/composites/composites_manager.cxx +++ b/src/libserver/composites/composites_manager.cxx @@ -472,10 +472,78 @@ void composites_manager::process_dependencies() (int) first_pass_composites.size(), (int) second_pass_composites.size()); } +/* Context for building inverted index */ +struct inverted_index_cbdata { + composites_manager *cm; + rspamd_composite *comp; + bool has_positive; +}; + +static void +inverted_index_atom_callback(GNode 
*atom_node, rspamd_expression_atom_t *atom, gpointer ud) +{ + auto *cbd = reinterpret_cast(ud); + + /* Check if this atom is under NOT operation */ + if (atom_node->parent && rspamd_expression_node_is_op(atom_node->parent, OP_NOT)) { + /* Negated atom - don't add to inverted index */ + return; + } + + /* Extract normalized symbol name from atom string */ + std::string_view atom_str(atom->str, atom->len); + + /* Skip special characters and find the actual symbol name */ + /* Atom format: [~-^]SYMBOL[options] */ + auto start = atom_str.begin(); + while (start != atom_str.end() && (*start == '~' || *start == '-' || *start == '^')) { + ++start; + } + + /* Find end of symbol name (before '[' if present) */ + auto end = std::find(start, atom_str.end(), '['); + + if (start >= end) { + return; /* Empty or invalid symbol */ + } + + std::string symbol_name(start, end); + + /* Mark that we have at least one positive atom */ + cbd->has_positive = true; + + /* Add to inverted index */ + cbd->cm->symbol_to_composites[symbol_name].push_back(cbd->comp); +} + +void composites_manager::build_inverted_index() +{ + msg_debug_config("building inverted index for %d composites", (int) all_composites.size()); + + for (auto &comp: all_composites) { + inverted_index_cbdata cbd{this, comp.get(), false}; + + rspamd_expression_atom_foreach_ex(comp->expr, inverted_index_atom_callback, &cbd); + + comp->has_positive_atoms = cbd.has_positive; + + if (!cbd.has_positive) { + /* Composite with only negated atoms - must always be checked */ + not_only_composites.push_back(comp.get()); + msg_debug_config("composite '%s' has only negated atoms, will always be checked", + comp->sym.c_str()); + } + } + + msg_debug_config("inverted index built: %d unique symbols, %d not-only composites", + (int) symbol_to_composites.size(), (int) not_only_composites.size()); +} + }// namespace rspamd::composites void rspamd_composites_process_deps(void *cm_ptr, struct rspamd_config *cfg) { auto *cm = 
COMPOSITE_MANAGER_FROM_PTR(cm_ptr); cm->process_dependencies(); + cm->build_inverted_index(); } \ No newline at end of file diff --git a/src/libutil/expression.c b/src/libutil/expression.c index cac7594d6a..5119694ce3 100644 --- a/src/libutil/expression.c +++ b/src/libutil/expression.c @@ -1737,6 +1737,37 @@ void rspamd_expression_atom_foreach(struct rspamd_expression *expr, rspamd_ast_atom_traverse, &data); } +struct atom_foreach_cbdata_ex { + rspamd_expression_atom_foreach_cb_ex cb; + gpointer cbdata; +}; + +static gboolean +rspamd_ast_atom_traverse_ex(GNode *n, gpointer d) +{ + struct atom_foreach_cbdata_ex *data = d; + struct rspamd_expression_elt *elt = n->data; + + if (elt->type == ELT_ATOM) { + data->cb(n, elt->p.atom, data->cbdata); + } + + return FALSE; +} + +void rspamd_expression_atom_foreach_ex(struct rspamd_expression *expr, + rspamd_expression_atom_foreach_cb_ex cb, gpointer cbdata) +{ + struct atom_foreach_cbdata_ex data; + + g_assert(expr != NULL); + + data.cb = cb; + data.cbdata = cbdata; + g_node_traverse(expr->ast, G_POST_ORDER, G_TRAVERSE_ALL, -1, + rspamd_ast_atom_traverse_ex, &data); +} + gboolean rspamd_expression_node_is_op(GNode *node, enum rspamd_expression_op op) { diff --git a/src/libutil/expression.h b/src/libutil/expression.h index 44a793e46f..5721b883c3 100644 --- a/src/libutil/expression.h +++ b/src/libutil/expression.h @@ -158,6 +158,22 @@ typedef void (*rspamd_expression_atom_foreach_cb)(const rspamd_ftok_t *atom, void rspamd_expression_atom_foreach(struct rspamd_expression *expr, rspamd_expression_atom_foreach_cb cb, gpointer cbdata); +/** + * Extended callback that provides access to the AST node (for checking parent operations like NOT) + */ +typedef void (*rspamd_expression_atom_foreach_cb_ex)(GNode *atom_node, + rspamd_expression_atom_t *atom, + gpointer ud); + +/** + * Traverse over all atoms in the expression with access to AST nodes + * @param expr expression + * @param cb callback to be called with GNode and full atom 
structure + * @param cbdata opaque data passed to `cb` + */ +void rspamd_expression_atom_foreach_ex(struct rspamd_expression *expr, + rspamd_expression_atom_foreach_cb_ex cb, gpointer cbdata); + /** * Checks if a specified node in AST is the specified operation * @param node AST node packed in GNode container