git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Add composites statistics and config options
author Vsevolod Stakhov <vsevolod@rspamd.com>
Wed, 26 Nov 2025 21:35:57 +0000 (21:35 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Wed, 26 Nov 2025 21:35:57 +0000 (21:35 +0000)
- Add config options: composites_inverted_index (default: true),
  composites_stats_always (default: false, i.e. timing is sampled on 1 in 256 tasks)
- Add EMA-based timing statistics using rspamd_counter_data
- Export stats via /stat endpoint in "composites" section:
  checked_slow, checked_fast, matched counts and timing with mean/stddev
- Allow toggling inverted index for performance comparison

src/controller.c
src/libserver/cfg_file.h
src/libserver/cfg_rcl.cxx
src/libserver/cfg_utils.cxx
src/libserver/composites/composites.cxx
src/libserver/composites/composites.h
src/libserver/composites/composites_internal.hxx
src/libserver/composites/composites_manager.cxx

index c575fbbbc15eae72001d77bb37395e734d7cc5bf..8f7a298a77c70ec3069680c48b2c98cf99fe14e3 100644 (file)
@@ -35,6 +35,7 @@
 #include "libmime/lang_detection.h"
 #include "mempool_vars_internal.h"
 #include "lua/lua_classnames.h"
+#include "libserver/composites/composites.h"
 #include <math.h>
 
 /* 60 seconds for worker's IO */
@@ -2916,6 +2917,46 @@ rspamd_controller_handle_stat_common(
        ucl_object_insert_key(top,
                                                  ucl_object_fromint(mem_st.fragmented_size), "fragmented", 0, false);
 
+       /* Composites statistics */
+       if (ctx->cfg->composites_manager) {
+               struct rspamd_composites_stats_export comp_stats;
+               ucl_object_t *comp_obj, *time_obj;
+
+               rspamd_composites_get_stats(ctx->cfg->composites_manager, &comp_stats);
+
+               comp_obj = ucl_object_typed_new(UCL_OBJECT);
+               ucl_object_insert_key(comp_obj, ucl_object_fromint(comp_stats.checked_slow),
+                                                         "checked_slow", 0, false);
+               ucl_object_insert_key(comp_obj, ucl_object_fromint(comp_stats.checked_fast),
+                                                         "checked_fast", 0, false);
+               ucl_object_insert_key(comp_obj, ucl_object_fromint(comp_stats.matched),
+                                                         "matched", 0, false);
+
+               time_obj = ucl_object_typed_new(UCL_OBJECT);
+               ucl_object_insert_key(time_obj, ucl_object_fromdouble(comp_stats.time_slow_mean),
+                                                         "mean", 0, false);
+               ucl_object_insert_key(time_obj, ucl_object_fromdouble(comp_stats.time_slow_stddev),
+                                                         "stddev", 0, false);
+               ucl_object_insert_key(time_obj, ucl_object_fromint(comp_stats.time_slow_count),
+                                                         "count", 0, false);
+               ucl_object_insert_key(comp_obj, time_obj, "time_slow_ms", 0, false);
+
+               time_obj = ucl_object_typed_new(UCL_OBJECT);
+               ucl_object_insert_key(time_obj, ucl_object_fromdouble(comp_stats.time_fast_mean),
+                                                         "mean", 0, false);
+               ucl_object_insert_key(time_obj, ucl_object_fromdouble(comp_stats.time_fast_stddev),
+                                                         "stddev", 0, false);
+               ucl_object_insert_key(time_obj, ucl_object_fromint(comp_stats.time_fast_count),
+                                                         "count", 0, false);
+               ucl_object_insert_key(comp_obj, time_obj, "time_fast_ms", 0, false);
+
+               ucl_object_insert_key(comp_obj,
+                                                         ucl_object_frombool(rspamd_composites_get_inverted_index(ctx->cfg->composites_manager)),
+                                                         "inverted_index_enabled", 0, false);
+
+               ucl_object_insert_key(top, comp_obj, "composites", 0, false);
+       }
+
        if (do_reset) {
                session->ctx->srv->stat->messages_scanned = 0;
                session->ctx->srv->stat->messages_learned = 0;
index aa3172bf13070443914886a3bab18c1e1e462c08..8794004be79de661ce9ccca9dba51c49e2d38d75 100644 (file)
@@ -378,6 +378,9 @@ struct rspamd_config {
        gboolean enable_mime_utf;                                /**< Enable utf8 mime parsing                                                  */
        gboolean enable_url_rewrite;                             /**< Enable HTML URL rewriting                                                 */
 
+       gboolean composites_inverted_index; /**< Use inverted index for composite lookup                        */
+       gboolean composites_stats_always;   /**< Always collect composite stats (not sampled)           */
+
        gsize max_cores_size;        /**< maximum size occupied by rspamd core files                    */
        gsize max_cores_count;       /**< maximum number of core files                                          */
        char *cores_dir;             /**< directory for core files                                                      */
index 3c13487400df391df37e06aab8abab17f5b12e82..59f498bb2ac1d2ba68f420ef35e70c31a6c2268e 100644 (file)
@@ -2117,6 +2117,18 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections)
                                                                           G_STRUCT_OFFSET(struct rspamd_config, enable_url_rewrite),
                                                                           0,
                                                                           "Enable HTML URL rewriting");
+               rspamd_rcl_add_default_handler(sub,
+                                                                          "composites_inverted_index",
+                                                                          rspamd_rcl_parse_struct_boolean,
+                                                                          G_STRUCT_OFFSET(struct rspamd_config, composites_inverted_index),
+                                                                          0,
+                                                                          "Use inverted index for fast composite lookup (default: true)");
+               rspamd_rcl_add_default_handler(sub,
+                                                                          "composites_stats_always",
+                                                                          rspamd_rcl_parse_struct_boolean,
+                                                                          G_STRUCT_OFFSET(struct rspamd_config, composites_stats_always),
+                                                                          0,
+                                                                          "Always collect composite statistics instead of probabilistic sampling (default: false)");
                rspamd_rcl_add_default_handler(sub,
                                                                           "url_rewrite_lua_func",
                                                                           rspamd_rcl_parse_struct_string,
index 1b7d6609ad8dd794a831e16bda8b845914df365e..2446db1da5350a2b316d55af0ffedc746f5a8758 100644 (file)
@@ -353,6 +353,8 @@ rspamd_config_new(enum rspamd_config_init_flags flags)
        cfg->enable_mime_utf = false;
        cfg->enable_url_rewrite = false;
        cfg->url_rewrite_lua_func = nullptr;
+       cfg->composites_inverted_index = true; /* Enable inverted index by default */
+       cfg->composites_stats_always = false;  /* Use probabilistic sampling by default */
        cfg->url_rewrite_fold_limit = 76;
        cfg->script_modules = g_ptr_array_new();
 
@@ -1010,6 +1012,9 @@ rspamd_config_post_load(struct rspamd_config *cfg,
 
                /* Process composite dependencies after symcache is initialized */
                if (cfg->composites_manager && rspamd_composites_manager_nelts(cfg->composites_manager) > 0) {
+                       /* Apply config options to composites manager */
+                       rspamd_composites_set_inverted_index(cfg->composites_manager,
+                                                                                                cfg->composites_inverted_index);
                        rspamd_composites_process_deps(cfg->composites_manager, cfg);
                }
        }
index dccd15cb8dfde9d57cbb63d9b59d13c588aca05c..608b16a3a027e89d89b0a98f882b37f25592bb58 100644 (file)
@@ -20,6 +20,8 @@
 #include "utlist.h"
 #include "scan_result.h"
 #include "composites.h"
+#include "contrib/libev/ev.h"
+#include "libutil/util.h"
 
 #include <cmath>
 #include <vector>
@@ -967,6 +969,9 @@ remove_symbols(const composites_data &cd, const std::vector<symbol_remove_data>
        }
 }
 
+/* Mask for timing sampling: a sample is taken when (random & mask) == 0, i.e. on average 1 in 256 callback invocations */
+constexpr uint64_t COMPOSITES_SAMPLING_MASK = 0xFF;
+
 static void
 composites_metric_callback(struct rspamd_task *task)
 {
@@ -974,6 +979,19 @@ composites_metric_callback(struct rspamd_task *task)
        struct rspamd_scan_result *mres;
        auto *cm = COMPOSITE_MANAGER_FROM_PTR(task->cfg->composites_manager);
        bool is_second_pass = (task->processed_stages & RSPAMD_TASK_STAGE_POST_FILTERS) != 0;
+       bool use_fast_path = cm->use_inverted_index && !is_second_pass;
+
+       /* Probabilistic sampling for timing measurements (unless composites_stats_always is set in config) */
+       bool do_sample = task->cfg->composites_stats_always ||
+                                        (rspamd_random_uint64_fast() & COMPOSITES_SAMPLING_MASK) == 0;
+       ev_tstamp start_time = 0.0;
+
+       if (do_sample && task->event_loop) {
+               ev_now_update_if_cheap(task->event_loop);
+               start_time = ev_now(task->event_loop);
+       }
+
+       uint64_t composites_checked = 0;
 
        comp_data_vec.reserve(1);
 
@@ -989,9 +1007,10 @@ composites_metric_callback(struct rspamd_task *task)
                                                                                        (gpointer) comp,
                                                                                        &cd);
                        }
+                       composites_checked += cm->second_pass_composites.size();
                }
-               else {
-                       /* First pass: use inverted index for fast lookup */
+               else if (use_fast_path) {
+                       /* First pass with inverted index: fast lookup */
                        ankerl::unordered_dense::set<rspamd_composite *> potentially_active;
 
                        /* Callback data for collecting potentially active composites */
@@ -1032,6 +1051,39 @@ composites_metric_callback(struct rspamd_task *task)
                                                                                        (gpointer) comp,
                                                                                        &cd);
                        }
+                       composites_checked += potentially_active.size();
+               }
+               else {
+                       /* Slow path: check all first-pass composites */
+                       msg_debug_composites("processing all %d first-pass composites (slow path)",
+                                                                (int) cm->first_pass_composites.size());
+                       for (auto *comp: cm->first_pass_composites) {
+                               composites_foreach_callback((gpointer) comp->sym.c_str(),
+                                                                                       (gpointer) comp,
+                                                                                       &cd);
+                       }
+                       composites_checked += cm->first_pass_composites.size();
+               }
+       }
+
+       /* Update statistics */
+       if (use_fast_path) {
+               cm->stats.checked_fast += composites_checked;
+       }
+       else if (!is_second_pass) {
+               cm->stats.checked_slow += composites_checked;
+       }
+
+       /* Record timing with EMA */
+       if (do_sample && task->event_loop) {
+               ev_now_update_if_cheap(task->event_loop);
+               ev_tstamp elapsed_ms = (ev_now(task->event_loop) - start_time) * 1000.0;
+
+               if (use_fast_path) {
+                       rspamd_set_counter_ema(&cm->stats.time_fast, elapsed_ms, 0.5);
+               }
+               else if (!is_second_pass) {
+                       rspamd_set_counter_ema(&cm->stats.time_slow, elapsed_ms, 0.5);
                }
        }
 
index cff7d67de1f4d59291e93ee4ca96b48ec4264f84..63761eed296e9d1dba0deb0f5b6cd1e695e456ba 100644 (file)
@@ -65,6 +65,42 @@ void *rspamd_composites_manager_add_from_string_silent(void *, const char *, con
  */
 void rspamd_composites_process_deps(void *cm_ptr, struct rspamd_config *cfg);
 
+/**
+ * Enable or disable the inverted index for composite lookup (when disabled, the first pass checks all first-pass composites)
+ * @param cm_ptr composites manager pointer (dereferenced without a NULL check)
+ * @param enabled TRUE to enable, FALSE to disable
+ */
+void rspamd_composites_set_inverted_index(void *cm_ptr, gboolean enabled);
+
+/**
+ * Get whether the inverted index is currently enabled for this manager
+ * @param cm_ptr composites manager pointer (dereferenced without a NULL check)
+ * @return TRUE if enabled, FALSE otherwise
+ */
+gboolean rspamd_composites_get_inverted_index(void *cm_ptr);
+
+/**
+ * Snapshot of composite processing statistics, as filled in by rspamd_composites_get_stats()
+ */
+struct rspamd_composites_stats_export {
+       uint64_t checked_slow;    /**< first-pass composites checked without the inverted index (slow path) */
+       uint64_t checked_fast;    /**< first-pass composites checked via the inverted index (fast path) */
+       uint64_t matched;         /**< composites that matched */
+       double time_slow_mean;    /**< EMA mean of measured slow-path time (ms) */
+       double time_slow_stddev;  /**< EMA stddev of measured slow-path time (ms) */
+       double time_fast_mean;    /**< EMA mean of measured fast-path time (ms) */
+       double time_fast_stddev;  /**< EMA stddev of measured fast-path time (ms) */
+       uint64_t time_slow_count; /**< number of slow-path timing measurements */
+       uint64_t time_fast_count; /**< number of fast-path timing measurements */
+};
+
+/**
+ * Copy the current composite processing statistics into a caller-provided structure
+ * @param cm_ptr composites manager pointer (dereferenced without a NULL check)
+ * @param stats output structure; every field is overwritten
+ */
+void rspamd_composites_get_stats(void *cm_ptr, struct rspamd_composites_stats_export *stats);
+
 #ifdef __cplusplus
 }
 #endif
index 8cce77f2677737940234d6d704f6eb942c7542d8..4f10eca3380028d21d85227e6eca6c1c503924ab 100644 (file)
@@ -20,6 +20,7 @@
 
 #include <string>
 #include "libutil/expression.h"
+#include "libutil/util.h"
 #include "libutil/cxx/hash_util.hxx"
 #include "libserver/cfg_file.h"
 
@@ -53,10 +54,21 @@ struct rspamd_composite {
 
 #define COMPOSITE_MANAGER_FROM_PTR(ptr) (reinterpret_cast<rspamd::composites::composites_manager *>(ptr))
 
+/**
+ * Counters and timing accumulators for composite processing, held by composites_manager
+ */
+struct composites_stats {
+       uint64_t checked_slow = 0;              /**< first-pass composites checked via slow path */
+       uint64_t checked_fast = 0;              /**< first-pass composites checked via inverted index */
+       uint64_t matched = 0;                   /**< composites that matched; NOTE(review): no increment is visible in this diff - confirm it is updated */
+       struct rspamd_counter_data time_slow{}; /**< EMA timing for slow path (ms) */
+       struct rspamd_counter_data time_fast{}; /**< EMA timing for fast path (ms) */
+};
+
 class composites_manager {
 public:
        composites_manager(struct rspamd_config *_cfg)
-               : cfg(_cfg)
+               : cfg(_cfg), use_inverted_index(true)
        {
                rspamd_mempool_add_destructor(_cfg->cfg_pool, composites_manager_dtor, this);
        }
@@ -122,6 +134,12 @@ public:
        /* Composites that have only negated atoms (must always be checked) */
        std::vector<rspamd_composite *> not_only_composites;
 
+       /* Configuration flags */
+       bool use_inverted_index; /**< Use inverted index for fast composite lookup (default: true) */
+
+       /* Statistics (counters updated on every first pass; timings sampled unless composites_stats_always is set) */
+       composites_stats stats{};
+
        /* Analyze composite dependencies and split into first/second pass vectors */
        void process_dependencies();
        /* Build inverted index for fast composite lookup */
index df09cea1564f790dc436a2fd3406f8437f311afe..5663762f1f5b7a24e92081c8dddd8111d4c24030 100644 (file)
@@ -613,4 +613,33 @@ void rspamd_composites_process_deps(void *cm_ptr, struct rspamd_config *cfg)
        cm->process_dependencies();
        rspamd_composites_resolve_atom_types(cm);
        cm->build_inverted_index();
+}
+
+void rspamd_composites_set_inverted_index(void *cm_ptr, gboolean enabled)
+{
+       auto *cm = COMPOSITE_MANAGER_FROM_PTR(cm_ptr); /* opaque C handle -> composites_manager; no NULL check */
+       cm->use_inverted_index = enabled;              /* gboolean converted to bool */
+}
+
+gboolean rspamd_composites_get_inverted_index(void *cm_ptr)
+{
+       auto *cm = COMPOSITE_MANAGER_FROM_PTR(cm_ptr); /* opaque C handle -> composites_manager; no NULL check */
+       return cm->use_inverted_index;                 /* bool converted to gboolean */
+}
+
+void rspamd_composites_get_stats(void *cm_ptr, struct rspamd_composites_stats_export *stats)
+{
+       auto *cm = COMPOSITE_MANAGER_FROM_PTR(cm_ptr); /* opaque C handle -> composites_manager; no NULL check */
+
+       stats->checked_slow = cm->stats.checked_slow; /* plain counters are copied as-is */
+       stats->checked_fast = cm->stats.checked_fast;
+       stats->matched = cm->stats.matched;
+
+       stats->time_slow_mean = cm->stats.time_slow.mean; /* slow-path timing taken from the EMA counter */
+       stats->time_slow_stddev = cm->stats.time_slow.stddev;
+       stats->time_slow_count = cm->stats.time_slow.number; /* exported as the number of measurements */
+
+       stats->time_fast_mean = cm->stats.time_fast.mean; /* fast-path timing taken from the EMA counter */
+       stats->time_fast_stddev = cm->stats.time_fast.stddev;
+       stats->time_fast_count = cm->stats.time_fast.number; /* exported as the number of measurements */
 }
\ No newline at end of file