]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Fix] Resolve composite references in inverted index for fast path
authorVsevolod Stakhov <vsevolod@rspamd.com>
Fri, 13 Feb 2026 07:37:27 +0000 (07:37 +0000)
committerVsevolod Stakhov <vsevolod@rspamd.com>
Fri, 13 Feb 2026 07:37:27 +0000 (07:37 +0000)
When the inverted index fast path collects potentially active composites,
it scans symbols already in the scan result. Composite symbols are not
yet in the scan result at that point, so composites referencing other
composites as atoms were never activated and silently failed to fire.

Fix by resolving composite references at index build time: recursively
collect leaf (non-composite) atoms and propagate dependent composites
to those atoms' index entries. Also refactor atom name parsing into a
shared helper to reduce duplication.

src/libserver/composites/composites_manager.cxx

index f951e37e0b3e07f76a3e6b6de6b04a617cdd389f..cfb04ec2087c9d6a6fd361c5e2cb96abea6a0530 100644 (file)
@@ -481,14 +481,6 @@ void composites_manager::process_dependencies()
                                         (int) first_pass_composites.size(), (int) second_pass_composites.size());
 }
 
-/* Context for building inverted index */
-struct inverted_index_cbdata {
-       composites_manager *cm;
-       rspamd_composite *comp;
-       bool has_positive;
-       bool has_group_atom; /* Composite uses group matcher (g:, g+:, g-:) */
-};
-
 /*
  * Count NOT operations from atom to root to determine atom polarity.
  * Even number of NOTs = positive atom (must be true for expression to be true)
@@ -511,25 +503,18 @@ atom_is_negated(GNode *atom_node)
        return (not_count & 1) != 0;
 }
 
-static void
-inverted_index_atom_callback(GNode *atom_node, rspamd_expression_atom_t *atom, gpointer ud)
+/* Extract the symbol name from an expression atom string, stripping prefixes (~, -, ^)
+ * and option brackets ([...]). Returns empty string for group matchers or invalid atoms. */
+static std::string
+extract_atom_symbol_name(const rspamd_expression_atom_t *atom)
 {
-       auto *cbd = reinterpret_cast<inverted_index_cbdata *>(ud);
-
-       /* Check atom polarity by counting NOTs to root */
-       if (atom_is_negated(atom_node)) {
-               /* Negated atom - don't add to inverted index */
-               return;
-       }
-
        if (atom->str == nullptr || atom->len == 0) {
-               return;
+               return {};
        }
 
-       /* Get the atom string with correct length */
        std::string_view atom_str(atom->str, atom->len);
 
-       /* Skip prefix characters (~, -, ^) to find the symbol name */
+       /* Skip prefix characters (~, -, ^) */
        size_t sym_start = 0;
        while (sym_start < atom_str.size() &&
                   (atom_str[sym_start] == '~' || atom_str[sym_start] == '-' || atom_str[sym_start] == '^')) {
@@ -537,31 +522,110 @@ inverted_index_atom_callback(GNode *atom_node, rspamd_expression_atom_t *atom, g
        }
 
        if (sym_start >= atom_str.size()) {
-               return;
+               return {};
        }
 
        std::string_view remaining = atom_str.substr(sym_start);
 
-       /* Check for group matchers: g:, g+:, g-: */
+       /* Skip group matchers: g:, g+:, g-: */
        if (remaining.size() >= 2 && remaining.substr(0, 2) == "g:") {
-               cbd->has_group_atom = true;
-               return;
+               return {};
        }
        if (remaining.size() >= 3 && (remaining.substr(0, 3) == "g+:" || remaining.substr(0, 3) == "g-:")) {
-               cbd->has_group_atom = true;
-               return;
+               return {};
        }
 
        /* Find end of symbol name (before '[' if present for options) */
        auto bracket_pos = remaining.find('[');
-       std::string symbol_name;
        if (bracket_pos != std::string_view::npos) {
-               symbol_name = std::string(remaining.substr(0, bracket_pos));
+               return std::string(remaining.substr(0, bracket_pos));
        }
-       else {
-               symbol_name = std::string(remaining);
+
+       return std::string(remaining);
+}
+
+/*
+ * Recursively collect all non-composite (leaf) positive atom symbol names
+ * for a composite. Handles transitive composite references with cycle detection.
+ */
+static void
+collect_leaf_atoms(composites_manager *cm, const rspamd_composite *comp,
+                                  ankerl::unordered_dense::set<std::string> &leaf_atoms,
+                                  ankerl::unordered_dense::set<int> &visited)
+{
+       if (visited.contains(comp->id)) {
+               return;
+       }
+       visited.insert(comp->id);
+
+       struct walk_data {
+               composites_manager *cm;
+               ankerl::unordered_dense::set<std::string> *leaves;
+               ankerl::unordered_dense::set<int> *visited;
+       } wd{cm, &leaf_atoms, &visited};
+
+       rspamd_expression_atom_foreach_ex(comp->expr, [](GNode *node, rspamd_expression_atom_t *atom, gpointer ud) {
+                       auto *wd = reinterpret_cast<walk_data *>(ud);
+
+                       if (atom_is_negated(node)) {
+                               return;
+                       }
+
+                       auto sym = extract_atom_symbol_name(atom);
+                       if (sym.empty()) {
+                               return;
+                       }
+
+                       auto *ref = wd->cm->find(sym);
+                       if (ref != nullptr) {
+                               /* Atom is a composite reference - recurse into it */
+                               collect_leaf_atoms(wd->cm, ref, *wd->leaves, *wd->visited);
+                       }
+                       else {
+                               wd->leaves->insert(std::move(sym));
+                       } }, &wd);
+}
+
+/* Context for building inverted index */
+struct inverted_index_cbdata {
+       composites_manager *cm;
+       rspamd_composite *comp;
+       bool has_positive;
+       bool has_group_atom; /* Composite uses group matcher (g:, g+:, g-:) */
+};
+
+static void
+inverted_index_atom_callback(GNode *atom_node, rspamd_expression_atom_t *atom, gpointer ud)
+{
+       auto *cbd = reinterpret_cast<inverted_index_cbdata *>(ud);
+
+       /* Check atom polarity by counting NOTs to root */
+       if (atom_is_negated(atom_node)) {
+               /* Negated atom - don't add to inverted index */
+               return;
+       }
+
+       if (atom->str == nullptr || atom->len == 0) {
+               return;
+       }
+
+       /* Check for group matchers first (need to look at full atom string) */
+       std::string_view atom_str(atom->str, atom->len);
+       size_t sym_start = 0;
+       while (sym_start < atom_str.size() &&
+                  (atom_str[sym_start] == '~' || atom_str[sym_start] == '-' || atom_str[sym_start] == '^')) {
+               ++sym_start;
+       }
+       if (sym_start < atom_str.size()) {
+               auto remaining = atom_str.substr(sym_start);
+               if ((remaining.size() >= 2 && remaining.substr(0, 2) == "g:") ||
+                       (remaining.size() >= 3 && (remaining.substr(0, 3) == "g+:" || remaining.substr(0, 3) == "g-:"))) {
+                       cbd->has_group_atom = true;
+                       return;
+               }
        }
 
+       auto symbol_name = extract_atom_symbol_name(atom);
        if (symbol_name.empty()) {
                return;
        }
@@ -602,6 +666,80 @@ void composites_manager::build_inverted_index()
                }
        }
 
+       /*
+        * Resolve composite references in the inverted index.
+        *
+        * When a composite C references another composite D as an atom, the inverted
+        * index will have an entry D -> [C]. But D is not in the scan result at lookup
+        * time (it gets inserted during composite evaluation), so C would never be
+        * activated via the fast path.
+        *
+        * Fix: for each composite-name key in the index, recursively find its leaf
+        * (non-composite) atoms and propagate the dependents to those leaf atoms'
+        * entries. Then remove the composite-name keys.
+        */
+       ankerl::unordered_dense::set<std::string> composite_keys;
+       for (const auto &[sym, _]: symbol_to_composites) {
+               if (find(sym) != nullptr) {
+                       composite_keys.insert(sym);
+               }
+       }
+
+       if (!composite_keys.empty()) {
+               msg_debug_config("resolving %d composite references in inverted index",
+                                                (int) composite_keys.size());
+
+               for (const auto &comp_key: composite_keys) {
+                       auto it = symbol_to_composites.find(comp_key);
+                       if (it == symbol_to_composites.end()) {
+                               continue;
+                       }
+
+                       auto dependents = it->second; /* Copy before modifying the map */
+                       auto *ref_comp = find(comp_key);
+
+                       /* Collect all leaf (non-composite) atoms reachable from this composite */
+                       ankerl::unordered_dense::set<std::string> leaf_atoms;
+                       ankerl::unordered_dense::set<int> visited;
+                       collect_leaf_atoms(this, ref_comp, leaf_atoms, visited);
+
+                       /* Propagate dependents to each leaf atom's index entry */
+                       for (const auto &leaf: leaf_atoms) {
+                               auto &entry = symbol_to_composites[leaf];
+                               for (auto *dep: dependents) {
+                                       if (std::find(entry.begin(), entry.end(), dep) == entry.end()) {
+                                               entry.push_back(dep);
+                                       }
+                               }
+                       }
+
+                       /*
+                        * If no leaf atoms found (e.g. composite depends only on other
+                        * composites with only negated/group atoms), ensure the dependent
+                        * composites are always checked.
+                        */
+                       if (leaf_atoms.empty()) {
+                               for (auto *dep: dependents) {
+                                       if (std::find(not_only_composites.begin(),
+                                                                 not_only_composites.end(), dep) == not_only_composites.end()) {
+                                               not_only_composites.push_back(dep);
+                                               msg_debug_config("composite '%s' depends on composite '%s' "
+                                                                                "with no leaf atoms, will always be checked",
+                                                                                dep->sym.c_str(), comp_key.c_str());
+                                       }
+                               }
+                       }
+
+                       /* Remove the composite-name key from the index */
+                       symbol_to_composites.erase(comp_key);
+
+                       msg_debug_config("resolved composite reference '%s': "
+                                                        "propagated %d dependents to %d leaf atoms",
+                                                        comp_key.c_str(), (int) dependents.size(),
+                                                        (int) leaf_atoms.size());
+               }
+       }
+
        msg_debug_config("inverted index built: %d unique symbols, %d not-only composites",
                                         (int) symbol_to_composites.size(), (int) not_only_composites.size());
 }