]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Add inverted index for composites optimization
authorVsevolod Stakhov <vsevolod@rspamd.com>
Tue, 25 Nov 2025 17:13:19 +0000 (17:13 +0000)
committerVsevolod Stakhov <vsevolod@rspamd.com>
Tue, 25 Nov 2025 17:13:19 +0000 (17:13 +0000)
Build an inverted index mapping symbol names to composites that contain
those symbols as positive (non-negated) atoms. This allows filtering out
composites that cannot possibly match during the first pass evaluation.

- Add rspamd_expression_atom_foreach_ex() to traverse expression atoms
  with access to AST nodes (needed to detect negated atoms)
- Add rspamd_expression_node_is_op() to check if a node is an operator
- Build inverted index in composites_manager during config processing
- Track composites with only negated atoms separately (they must always
  be evaluated)
- Use inverted index in composites_metric_callback for first pass to
  evaluate only potentially matching composites

For configurations with many composites (4000+), this reduces the number
of composites evaluated per message from all to only those that have at
least one matching symbol present.

src/libserver/composites/composites.cxx
src/libserver/composites/composites_internal.hxx
src/libserver/composites/composites_manager.cxx
src/libutil/expression.c
src/libutil/expression.h

index 6e7e435a4e4f5a9d487a630671e4eec1f5ff6b1c..b9ab013da22e6d762309b3c0878ce7682083b854 100644 (file)
@@ -991,12 +991,47 @@ composites_metric_callback(struct rspamd_task *task)
                        }
                }
                else {
-                       /* First pass: use symcache iteration (will skip second-pass composites in callback) */
-                       msg_debug_composites("processing first-pass composites via symcache");
-                       rspamd_symcache_composites_foreach(task,
-                                                                                          task->cfg->cache,
-                                                                                          composites_foreach_callback,
-                                                                                          &cd);
+                       /* First pass: use inverted index for fast lookup */
+                       ankerl::unordered_dense::set<rspamd_composite *> potentially_active;
+
+                       /* Callback data for collecting potentially active composites */
+                       struct collect_active_cbdata {
+                               composites_manager *cm;
+                               ankerl::unordered_dense::set<rspamd_composite *> *active;
+                       } collect_data{cm, &potentially_active};
+
+                       /* Collect composites that have at least one positive atom present */
+                       rspamd_task_symbol_result_foreach(task, mres, [](gpointer key, gpointer value, gpointer ud) {
+                                                                                                 auto *cbd = reinterpret_cast<collect_active_cbdata *>(ud);
+                                                                                                 std::string_view sym_name{reinterpret_cast<const char *>(key)};
+
+                                                                                                 auto it = cbd->cm->symbol_to_composites.find(sym_name);
+                                                                                                 if (it != cbd->cm->symbol_to_composites.end()) {
+                                                                                                         for (auto *comp: it->second) {
+                                                                                                                 /* Only add first-pass composites */
+                                                                                                                 if (!comp->second_pass) {
+                                                                                                                         cbd->active->insert(comp);
+                                                                                                                 }
+                                                                                                         }
+                                                                                                 } }, &collect_data);
+
+                       /* Always add NOT-only composites (they have no positive atoms) */
+                       for (auto *comp: cm->not_only_composites) {
+                               if (!comp->second_pass) {
+                                       potentially_active.insert(comp);
+                               }
+                       }
+
+                       msg_debug_composites("processing %d potentially active composites (from %d first-pass)",
+                                                                (int) potentially_active.size(),
+                                                                (int) cm->first_pass_composites.size());
+
+                       /* Process only potentially active composites */
+                       for (auto *comp: potentially_active) {
+                               composites_foreach_callback((gpointer) comp->sym.c_str(),
+                                                                                       (gpointer) comp,
+                                                                                       &cd);
+                       }
                }
        }
 
index 09ff48e8d0c2ba8afaf9a1d50e760c48d4efdc99..032747e87263512cdf6e68e7a7edb12f8b8cc8a4 100644 (file)
@@ -47,7 +47,8 @@ struct rspamd_composite {
        struct rspamd_expression *expr;
        int id;
        rspamd_composite_policy policy;
-       bool second_pass; /**< true if this composite needs second pass evaluation */
+       bool second_pass;        /**< true if this composite needs second pass evaluation */
+       bool has_positive_atoms; /**< true if composite has at least one non-negated atom */
 };
 
 #define COMPOSITE_MANAGER_FROM_PTR(ptr) (reinterpret_cast<rspamd::composites::composites_manager *>(ptr))
@@ -114,8 +115,17 @@ public:
        std::vector<rspamd_composite *> first_pass_composites;  /* Evaluated during COMPOSITES stage */
        std::vector<rspamd_composite *> second_pass_composites; /* Evaluated during COMPOSITES_POST stage */
 
+       /* Inverted index: symbol -> composites that contain this symbol as positive atom */
+       ankerl::unordered_dense::map<std::string, std::vector<rspamd_composite *>,
+                                                                rspamd::smart_str_hash, rspamd::smart_str_equal>
+               symbol_to_composites;
+       /* Composites that have only negated atoms (must always be checked) */
+       std::vector<rspamd_composite *> not_only_composites;
+
        /* Analyze composite dependencies and split into first/second pass vectors */
        void process_dependencies();
+       /* Build inverted index for fast composite lookup */
+       void build_inverted_index();
 };
 
 }// namespace rspamd::composites
index c1bc1d94a29fc02dfa7bca822d6de3c837f7625f..57ae175cfb0d14e80983faf5a710a638bf04a224 100644 (file)
@@ -472,10 +472,78 @@ void composites_manager::process_dependencies()
                                         (int) first_pass_composites.size(), (int) second_pass_composites.size());
 }
 
+/* Context for building inverted index */
+struct inverted_index_cbdata {
+       composites_manager *cm;
+       rspamd_composite *comp;
+       bool has_positive;
+};
+
+static void
+inverted_index_atom_callback(GNode *atom_node, rspamd_expression_atom_t *atom, gpointer ud)
+{
+       auto *cbd = reinterpret_cast<inverted_index_cbdata *>(ud);
+
+       /* Check if this atom is under NOT operation */
+       if (atom_node->parent && rspamd_expression_node_is_op(atom_node->parent, OP_NOT)) {
+               /* Negated atom - don't add to inverted index */
+               return;
+       }
+
+       /* Extract normalized symbol name from atom string */
+       std::string_view atom_str(atom->str, atom->len);
+
+       /* Skip special characters and find the actual symbol name */
+       /* Atom format: [~-^]SYMBOL[options] */
+       auto start = atom_str.begin();
+       while (start != atom_str.end() && (*start == '~' || *start == '-' || *start == '^')) {
+               ++start;
+       }
+
+       /* Find end of symbol name (before '[' if present) */
+       auto end = std::find(start, atom_str.end(), '[');
+
+       if (start >= end) {
+               return; /* Empty or invalid symbol */
+       }
+
+       std::string symbol_name(start, end);
+
+       /* Mark that we have at least one positive atom */
+       cbd->has_positive = true;
+
+       /* Add to inverted index */
+       cbd->cm->symbol_to_composites[symbol_name].push_back(cbd->comp);
+}
+
+void composites_manager::build_inverted_index()
+{
+       msg_debug_config("building inverted index for %d composites", (int) all_composites.size());
+
+       for (auto &comp: all_composites) {
+               inverted_index_cbdata cbd{this, comp.get(), false};
+
+               rspamd_expression_atom_foreach_ex(comp->expr, inverted_index_atom_callback, &cbd);
+
+               comp->has_positive_atoms = cbd.has_positive;
+
+               if (!cbd.has_positive) {
+                       /* Composite with only negated atoms - must always be checked */
+                       not_only_composites.push_back(comp.get());
+                       msg_debug_config("composite '%s' has only negated atoms, will always be checked",
+                                                        comp->sym.c_str());
+               }
+       }
+
+       msg_debug_config("inverted index built: %d unique symbols, %d not-only composites",
+                                        (int) symbol_to_composites.size(), (int) not_only_composites.size());
+}
+
 }// namespace rspamd::composites
 
 void rspamd_composites_process_deps(void *cm_ptr, struct rspamd_config *cfg)
 {
        auto *cm = COMPOSITE_MANAGER_FROM_PTR(cm_ptr);
        cm->process_dependencies();
+       cm->build_inverted_index();
 }
\ No newline at end of file
index cac7594d6a179e5d6bdf8ac3ef7ea2516f8eaf15..5119694ce38c70e02e5f2c2f989fb2fe2b947178 100644 (file)
@@ -1737,6 +1737,37 @@ void rspamd_expression_atom_foreach(struct rspamd_expression *expr,
                                        rspamd_ast_atom_traverse, &data);
 }
 
+struct atom_foreach_cbdata_ex {
+       rspamd_expression_atom_foreach_cb_ex cb;
+       gpointer cbdata;
+};
+
+static gboolean
+rspamd_ast_atom_traverse_ex(GNode *n, gpointer d)
+{
+       struct atom_foreach_cbdata_ex *data = d;
+       struct rspamd_expression_elt *elt = n->data;
+
+       if (elt->type == ELT_ATOM) {
+               data->cb(n, elt->p.atom, data->cbdata);
+       }
+
+       return FALSE;
+}
+
+void rspamd_expression_atom_foreach_ex(struct rspamd_expression *expr,
+                                                                          rspamd_expression_atom_foreach_cb_ex cb, gpointer cbdata)
+{
+       struct atom_foreach_cbdata_ex data;
+
+       g_assert(expr != NULL);
+
+       data.cb = cb;
+       data.cbdata = cbdata;
+       g_node_traverse(expr->ast, G_POST_ORDER, G_TRAVERSE_ALL, -1,
+                                       rspamd_ast_atom_traverse_ex, &data);
+}
+
 gboolean
 rspamd_expression_node_is_op(GNode *node, enum rspamd_expression_op op)
 {
index 44a793e46f8b79e61f1517dce65177710cc5fcf3..5721b883c3ccfc7e445905ef9315a18acbdcba3e 100644 (file)
@@ -158,6 +158,22 @@ typedef void (*rspamd_expression_atom_foreach_cb)(const rspamd_ftok_t *atom,
 void rspamd_expression_atom_foreach(struct rspamd_expression *expr,
                                                                        rspamd_expression_atom_foreach_cb cb, gpointer cbdata);
 
+/**
+ * Extended callback that provides access to the AST node (for checking parent operations like NOT)
+ */
+typedef void (*rspamd_expression_atom_foreach_cb_ex)(GNode *atom_node,
+                                                                                                        rspamd_expression_atom_t *atom,
+                                                                                                        gpointer ud);
+
+/**
+ * Traverse over all atoms in the expression with access to AST nodes
+ * @param expr expression
+ * @param cb callback to be called with GNode and full atom structure
+ * @param ud opaque data passed to `cb`
+ */
+void rspamd_expression_atom_foreach_ex(struct rspamd_expression *expr,
+                                                                          rspamd_expression_atom_foreach_cb_ex cb, gpointer cbdata);
+
 /**
  * Checks if a specified node in AST is the specified operation
  * @param node AST node packed in GNode container