From: Vsevolod Stakhov Date: Wed, 26 Nov 2025 21:35:57 +0000 (+0000) Subject: [Feature] Add composites statistics and config options X-Git-Tag: 3.14.1~5^2~1 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b0e3a18b8eacb07d9b562a8a72a09e3e540eeab4;p=thirdparty%2Frspamd.git [Feature] Add composites statistics and config options - Add config options: composites_inverted_index (default: true), composites_stats_always (default: false for 1/256 sampling) - Add EMA-based timing statistics using rspamd_counter_data - Export stats via /stat endpoint in "composites" section: checked_slow, checked_fast, matched counts and timing with mean/stddev - Allow toggling inverted index for performance comparison --- diff --git a/src/controller.c b/src/controller.c index c575fbbbc1..8f7a298a77 100644 --- a/src/controller.c +++ b/src/controller.c @@ -35,6 +35,7 @@ #include "libmime/lang_detection.h" #include "mempool_vars_internal.h" #include "lua/lua_classnames.h" +#include "libserver/composites/composites.h" #include /* 60 seconds for worker's IO */ @@ -2916,6 +2917,46 @@ rspamd_controller_handle_stat_common( ucl_object_insert_key(top, ucl_object_fromint(mem_st.fragmented_size), "fragmented", 0, false); + /* Composites statistics */ + if (ctx->cfg->composites_manager) { + struct rspamd_composites_stats_export comp_stats; + ucl_object_t *comp_obj, *time_obj; + + rspamd_composites_get_stats(ctx->cfg->composites_manager, &comp_stats); + + comp_obj = ucl_object_typed_new(UCL_OBJECT); + ucl_object_insert_key(comp_obj, ucl_object_fromint(comp_stats.checked_slow), + "checked_slow", 0, false); + ucl_object_insert_key(comp_obj, ucl_object_fromint(comp_stats.checked_fast), + "checked_fast", 0, false); + ucl_object_insert_key(comp_obj, ucl_object_fromint(comp_stats.matched), + "matched", 0, false); + + time_obj = ucl_object_typed_new(UCL_OBJECT); + ucl_object_insert_key(time_obj, ucl_object_fromdouble(comp_stats.time_slow_mean), + "mean", 0, false); + 
ucl_object_insert_key(time_obj, ucl_object_fromdouble(comp_stats.time_slow_stddev), + "stddev", 0, false); + ucl_object_insert_key(time_obj, ucl_object_fromint(comp_stats.time_slow_count), + "count", 0, false); + ucl_object_insert_key(comp_obj, time_obj, "time_slow_ms", 0, false); + + time_obj = ucl_object_typed_new(UCL_OBJECT); + ucl_object_insert_key(time_obj, ucl_object_fromdouble(comp_stats.time_fast_mean), + "mean", 0, false); + ucl_object_insert_key(time_obj, ucl_object_fromdouble(comp_stats.time_fast_stddev), + "stddev", 0, false); + ucl_object_insert_key(time_obj, ucl_object_fromint(comp_stats.time_fast_count), + "count", 0, false); + ucl_object_insert_key(comp_obj, time_obj, "time_fast_ms", 0, false); + + ucl_object_insert_key(comp_obj, + ucl_object_frombool(rspamd_composites_get_inverted_index(ctx->cfg->composites_manager)), + "inverted_index_enabled", 0, false); + + ucl_object_insert_key(top, comp_obj, "composites", 0, false); + } + if (do_reset) { session->ctx->srv->stat->messages_scanned = 0; session->ctx->srv->stat->messages_learned = 0; diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h index aa3172bf13..8794004be7 100644 --- a/src/libserver/cfg_file.h +++ b/src/libserver/cfg_file.h @@ -378,6 +378,9 @@ struct rspamd_config { gboolean enable_mime_utf; /**< Enable utf8 mime parsing */ gboolean enable_url_rewrite; /**< Enable HTML URL rewriting */ + gboolean composites_inverted_index; /**< Use inverted index for composite lookup */ + gboolean composites_stats_always; /**< Always collect composite stats (not sampled) */ + gsize max_cores_size; /**< maximum size occupied by rspamd core files */ gsize max_cores_count; /**< maximum number of core files */ char *cores_dir; /**< directory for core files */ diff --git a/src/libserver/cfg_rcl.cxx b/src/libserver/cfg_rcl.cxx index 3c13487400..59f498bb2a 100644 --- a/src/libserver/cfg_rcl.cxx +++ b/src/libserver/cfg_rcl.cxx @@ -2117,6 +2117,18 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, 
GHashTable *skip_sections) G_STRUCT_OFFSET(struct rspamd_config, enable_url_rewrite), 0, "Enable HTML URL rewriting"); + rspamd_rcl_add_default_handler(sub, + "composites_inverted_index", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, composites_inverted_index), + 0, + "Use inverted index for fast composite lookup (default: true)"); + rspamd_rcl_add_default_handler(sub, + "composites_stats_always", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, composites_stats_always), + 0, + "Always collect composite statistics instead of probabilistic sampling (default: false)"); rspamd_rcl_add_default_handler(sub, "url_rewrite_lua_func", rspamd_rcl_parse_struct_string, diff --git a/src/libserver/cfg_utils.cxx b/src/libserver/cfg_utils.cxx index 1b7d6609ad..2446db1da5 100644 --- a/src/libserver/cfg_utils.cxx +++ b/src/libserver/cfg_utils.cxx @@ -353,6 +353,8 @@ rspamd_config_new(enum rspamd_config_init_flags flags) cfg->enable_mime_utf = false; cfg->enable_url_rewrite = false; cfg->url_rewrite_lua_func = nullptr; + cfg->composites_inverted_index = true; /* Enable inverted index by default */ + cfg->composites_stats_always = false; /* Use probabilistic sampling by default */ cfg->url_rewrite_fold_limit = 76; cfg->script_modules = g_ptr_array_new(); @@ -1010,6 +1012,9 @@ rspamd_config_post_load(struct rspamd_config *cfg, /* Process composite dependencies after symcache is initialized */ if (cfg->composites_manager && rspamd_composites_manager_nelts(cfg->composites_manager) > 0) { + /* Apply config options to composites manager */ + rspamd_composites_set_inverted_index(cfg->composites_manager, + cfg->composites_inverted_index); rspamd_composites_process_deps(cfg->composites_manager, cfg); } } diff --git a/src/libserver/composites/composites.cxx b/src/libserver/composites/composites.cxx index dccd15cb8d..608b16a3a0 100644 --- a/src/libserver/composites/composites.cxx +++ b/src/libserver/composites/composites.cxx @@ -20,6 +20,8 
@@ #include "utlist.h" #include "scan_result.h" #include "composites.h" +#include "contrib/libev/ev.h" +#include "libutil/util.h" #include #include @@ -967,6 +969,9 @@ remove_symbols(const composites_data &cd, const std::vector } } +/* Sampling rate for timing measurements: 1 in 256 tasks */ +constexpr uint64_t COMPOSITES_SAMPLING_MASK = 0xFF; + static void composites_metric_callback(struct rspamd_task *task) { @@ -974,6 +979,19 @@ composites_metric_callback(struct rspamd_task *task) struct rspamd_scan_result *mres; auto *cm = COMPOSITE_MANAGER_FROM_PTR(task->cfg->composites_manager); bool is_second_pass = (task->processed_stages & RSPAMD_TASK_STAGE_POST_FILTERS) != 0; + bool use_fast_path = cm->use_inverted_index && !is_second_pass; + + /* Probabilistic sampling for timing measurements (unless composites_stats_always is set in config) */ + bool do_sample = task->cfg->composites_stats_always || + (rspamd_random_uint64_fast() & COMPOSITES_SAMPLING_MASK) == 0; + ev_tstamp start_time = 0.0; + + if (do_sample && task->event_loop) { + ev_now_update_if_cheap(task->event_loop); + start_time = ev_now(task->event_loop); + } + + uint64_t composites_checked = 0; comp_data_vec.reserve(1); @@ -989,9 +1007,10 @@ composites_metric_callback(struct rspamd_task *task) (gpointer) comp, &cd); } + composites_checked += cm->second_pass_composites.size(); } - else { - /* First pass: use inverted index for fast lookup */ + else if (use_fast_path) { + /* First pass with inverted index: fast lookup */ ankerl::unordered_dense::set potentially_active; /* Callback data for collecting potentially active composites */ @@ -1032,6 +1051,39 @@ composites_metric_callback(struct rspamd_task *task) (gpointer) comp, &cd); } + composites_checked += potentially_active.size(); + } + else { + /* Slow path: check all first-pass composites */ + msg_debug_composites("processing all %d first-pass composites (slow path)", + (int) cm->first_pass_composites.size()); + for (auto *comp: cm->first_pass_composites) { + 
composites_foreach_callback((gpointer) comp->sym.c_str(), + (gpointer) comp, + &cd); + } + composites_checked += cm->first_pass_composites.size(); + } + } + + /* Update statistics */ + if (use_fast_path) { + cm->stats.checked_fast += composites_checked; + } + else if (!is_second_pass) { + cm->stats.checked_slow += composites_checked; + } + + /* Record timing with EMA */ + if (do_sample && task->event_loop) { + ev_now_update_if_cheap(task->event_loop); + ev_tstamp elapsed_ms = (ev_now(task->event_loop) - start_time) * 1000.0; + + if (use_fast_path) { + rspamd_set_counter_ema(&cm->stats.time_fast, elapsed_ms, 0.5); + } + else if (!is_second_pass) { + rspamd_set_counter_ema(&cm->stats.time_slow, elapsed_ms, 0.5); } } diff --git a/src/libserver/composites/composites.h b/src/libserver/composites/composites.h index cff7d67de1..63761eed29 100644 --- a/src/libserver/composites/composites.h +++ b/src/libserver/composites/composites.h @@ -65,6 +65,42 @@ void *rspamd_composites_manager_add_from_string_silent(void *, const char *, con */ void rspamd_composites_process_deps(void *cm_ptr, struct rspamd_config *cfg); +/** + * Enable or disable inverted index for fast composite lookup + * @param cm_ptr composites manager pointer + * @param enabled true to enable, false to disable + */ +void rspamd_composites_set_inverted_index(void *cm_ptr, gboolean enabled); + +/** + * Get whether inverted index is enabled + * @param cm_ptr composites manager pointer + * @return true if enabled + */ +gboolean rspamd_composites_get_inverted_index(void *cm_ptr); + +/** + * Statistics structure for composite processing + */ +struct rspamd_composites_stats_export { + uint64_t checked_slow; /**< composites checked via slow path */ + uint64_t checked_fast; /**< composites checked via inverted index */ + uint64_t matched; /**< composites that matched */ + double time_slow_mean; /**< EMA mean time in slow path (ms) */ + double time_slow_stddev; /**< EMA stddev time in slow path (ms) */ + double 
time_fast_mean; /**< EMA mean time in fast path (ms) */ + double time_fast_stddev; /**< EMA stddev time in fast path (ms) */ + uint64_t time_slow_count; /**< number of slow path measurements */ + uint64_t time_fast_count; /**< number of fast path measurements */ +}; + +/** + * Get composite processing statistics + * @param cm_ptr composites manager pointer + * @param stats output structure + */ +void rspamd_composites_get_stats(void *cm_ptr, struct rspamd_composites_stats_export *stats); + #ifdef __cplusplus } #endif diff --git a/src/libserver/composites/composites_internal.hxx b/src/libserver/composites/composites_internal.hxx index 8cce77f267..4f10eca338 100644 --- a/src/libserver/composites/composites_internal.hxx +++ b/src/libserver/composites/composites_internal.hxx @@ -20,6 +20,7 @@ #include #include "libutil/expression.h" +#include "libutil/util.h" #include "libutil/cxx/hash_util.hxx" #include "libserver/cfg_file.h" @@ -53,10 +54,21 @@ struct rspamd_composite { #define COMPOSITE_MANAGER_FROM_PTR(ptr) (reinterpret_cast(ptr)) +/** + * Statistics for composite processing + */ +struct composites_stats { + uint64_t checked_slow = 0; /**< composites checked via slow path */ + uint64_t checked_fast = 0; /**< composites checked via inverted index */ + uint64_t matched = 0; /**< composites that matched */ + struct rspamd_counter_data time_slow{}; /**< EMA timing for slow path */ + struct rspamd_counter_data time_fast{}; /**< EMA timing for fast path */ +}; + class composites_manager { public: composites_manager(struct rspamd_config *_cfg) - : cfg(_cfg) + : cfg(_cfg), use_inverted_index(true) { rspamd_mempool_add_destructor(_cfg->cfg_pool, composites_manager_dtor, this); } @@ -122,6 +134,12 @@ public: /* Composites that have only negated atoms (must always be checked) */ std::vector not_only_composites; + /* Configuration flags */ + bool use_inverted_index; /**< Use inverted index for fast composite lookup (default: true) */ + + /* Statistics (updated 
per call for the counters, probabilistically for the timings) */ + composites_stats stats{}; + /* Analyze composite dependencies and split into first/second pass vectors */ void process_dependencies(); /* Build inverted index for fast composite lookup */ diff --git a/src/libserver/composites/composites_manager.cxx b/src/libserver/composites/composites_manager.cxx index df09cea156..5663762f1f 100644 --- a/src/libserver/composites/composites_manager.cxx +++ b/src/libserver/composites/composites_manager.cxx @@ -613,4 +613,33 @@ void rspamd_composites_process_deps(void *cm_ptr, struct rspamd_config *cfg) cm->process_dependencies(); rspamd_composites_resolve_atom_types(cm); cm->build_inverted_index(); +} + +void rspamd_composites_set_inverted_index(void *cm_ptr, gboolean enabled) +{ + auto *cm = COMPOSITE_MANAGER_FROM_PTR(cm_ptr); + cm->use_inverted_index = enabled; +} + +gboolean rspamd_composites_get_inverted_index(void *cm_ptr) +{ + auto *cm = COMPOSITE_MANAGER_FROM_PTR(cm_ptr); + return cm->use_inverted_index; +} + +void rspamd_composites_get_stats(void *cm_ptr, struct rspamd_composites_stats_export *stats) +{ + auto *cm = COMPOSITE_MANAGER_FROM_PTR(cm_ptr); + + stats->checked_slow = cm->stats.checked_slow; + stats->checked_fast = cm->stats.checked_fast; + stats->matched = cm->stats.matched; + + stats->time_slow_mean = cm->stats.time_slow.mean; + stats->time_slow_stddev = cm->stats.time_slow.stddev; + stats->time_slow_count = cm->stats.time_slow.number; + + stats->time_fast_mean = cm->stats.time_fast.mean; + stats->time_fast_stddev = cm->stats.time_fast.stddev; + stats->time_fast_count = cm->stats.time_fast.number; } \ No newline at end of file