#include "libmime/lang_detection.h"
#include "mempool_vars_internal.h"
#include "lua/lua_classnames.h"
+#include "libserver/composites/composites.h"
#include <math.h>
/* 60 seconds for worker's IO */
ucl_object_insert_key(top,
ucl_object_fromint(mem_st.fragmented_size), "fragmented", 0, false);
+ /* Composites statistics */
+ if (ctx->cfg->composites_manager) {
+ struct rspamd_composites_stats_export comp_stats;
+ ucl_object_t *comp_obj, *time_obj;
+
+ rspamd_composites_get_stats(ctx->cfg->composites_manager, &comp_stats);
+
+ comp_obj = ucl_object_typed_new(UCL_OBJECT);
+ ucl_object_insert_key(comp_obj, ucl_object_fromint(comp_stats.checked_slow),
+ "checked_slow", 0, false);
+ ucl_object_insert_key(comp_obj, ucl_object_fromint(comp_stats.checked_fast),
+ "checked_fast", 0, false);
+ ucl_object_insert_key(comp_obj, ucl_object_fromint(comp_stats.matched),
+ "matched", 0, false);
+
+ time_obj = ucl_object_typed_new(UCL_OBJECT);
+ ucl_object_insert_key(time_obj, ucl_object_fromdouble(comp_stats.time_slow_mean),
+ "mean", 0, false);
+ ucl_object_insert_key(time_obj, ucl_object_fromdouble(comp_stats.time_slow_stddev),
+ "stddev", 0, false);
+ ucl_object_insert_key(time_obj, ucl_object_fromint(comp_stats.time_slow_count),
+ "count", 0, false);
+ ucl_object_insert_key(comp_obj, time_obj, "time_slow_ms", 0, false);
+
+ time_obj = ucl_object_typed_new(UCL_OBJECT);
+ ucl_object_insert_key(time_obj, ucl_object_fromdouble(comp_stats.time_fast_mean),
+ "mean", 0, false);
+ ucl_object_insert_key(time_obj, ucl_object_fromdouble(comp_stats.time_fast_stddev),
+ "stddev", 0, false);
+ ucl_object_insert_key(time_obj, ucl_object_fromint(comp_stats.time_fast_count),
+ "count", 0, false);
+ ucl_object_insert_key(comp_obj, time_obj, "time_fast_ms", 0, false);
+
+ ucl_object_insert_key(comp_obj,
+ ucl_object_frombool(rspamd_composites_get_inverted_index(ctx->cfg->composites_manager)),
+ "inverted_index_enabled", 0, false);
+
+ ucl_object_insert_key(top, comp_obj, "composites", 0, false);
+ }
+
if (do_reset) {
session->ctx->srv->stat->messages_scanned = 0;
session->ctx->srv->stat->messages_learned = 0;
gboolean enable_mime_utf; /**< Enable utf8 mime parsing */
gboolean enable_url_rewrite; /**< Enable HTML URL rewriting */
+ gboolean composites_inverted_index; /**< Use inverted index for composite lookup */
+ gboolean composites_stats_always; /**< Always collect composite stats (not sampled) */
+
gsize max_cores_size; /**< maximum size occupied by rspamd core files */
gsize max_cores_count; /**< maximum number of core files */
char *cores_dir; /**< directory for core files */
G_STRUCT_OFFSET(struct rspamd_config, enable_url_rewrite),
0,
"Enable HTML URL rewriting");
+ rspamd_rcl_add_default_handler(sub,
+ "composites_inverted_index",
+ rspamd_rcl_parse_struct_boolean,
+ G_STRUCT_OFFSET(struct rspamd_config, composites_inverted_index),
+ 0,
+ "Use inverted index for fast composite lookup (default: true)");
+ rspamd_rcl_add_default_handler(sub,
+ "composites_stats_always",
+ rspamd_rcl_parse_struct_boolean,
+ G_STRUCT_OFFSET(struct rspamd_config, composites_stats_always),
+ 0,
+ "Always collect composite statistics instead of probabilistic sampling (default: false)");
rspamd_rcl_add_default_handler(sub,
"url_rewrite_lua_func",
rspamd_rcl_parse_struct_string,
cfg->enable_mime_utf = false;
cfg->enable_url_rewrite = false;
cfg->url_rewrite_lua_func = nullptr;
+ cfg->composites_inverted_index = true; /* Enable inverted index by default */
+ cfg->composites_stats_always = false; /* Use probabilistic sampling by default */
cfg->url_rewrite_fold_limit = 76;
cfg->script_modules = g_ptr_array_new();
/* Process composite dependencies after symcache is initialized */
if (cfg->composites_manager && rspamd_composites_manager_nelts(cfg->composites_manager) > 0) {
+ /* Apply config options to composites manager */
+ rspamd_composites_set_inverted_index(cfg->composites_manager,
+ cfg->composites_inverted_index);
rspamd_composites_process_deps(cfg->composites_manager, cfg);
}
}
#include "utlist.h"
#include "scan_result.h"
#include "composites.h"
+#include "contrib/libev/ev.h"
+#include "libutil/util.h"
#include <cmath>
#include <vector>
}
}
+/*
+ * Sampling rate for timing measurements: 1 in 256 tasks.
+ * A task is sampled when the low 8 bits of a fast PRNG draw are all zero,
+ * keeping the ev_now()/clock overhead off the common path. The config flag
+ * composites_stats_always bypasses this mask and samples every task.
+ */
+constexpr uint64_t COMPOSITES_SAMPLING_MASK = 0xFF;
+
static void
composites_metric_callback(struct rspamd_task *task)
{
struct rspamd_scan_result *mres;
auto *cm = COMPOSITE_MANAGER_FROM_PTR(task->cfg->composites_manager);
bool is_second_pass = (task->processed_stages & RSPAMD_TASK_STAGE_POST_FILTERS) != 0;
+ /* Fast path is only valid on the first pass; second pass always walks its own list */
+ bool use_fast_path = cm->use_inverted_index && !is_second_pass;
+
+ /* Probabilistic sampling for timing measurements (unless always_sample is set in config) */
+ bool do_sample = task->cfg->composites_stats_always ||
+ (rspamd_random_uint64_fast() & COMPOSITES_SAMPLING_MASK) == 0;
+ ev_tstamp start_time = 0.0;
+
+ if (do_sample && task->event_loop) {
+ ev_now_update_if_cheap(task->event_loop);
+ start_time = ev_now(task->event_loop);
+ }
+
+ /* Composites examined in this invocation; folded into cm->stats below */
+ uint64_t composites_checked = 0;
comp_data_vec.reserve(1);
(gpointer) comp,
&cd);
}
+ composites_checked += cm->second_pass_composites.size();
+ /*
+ * NOTE(review): this second-pass count is accumulated but never recorded:
+ * the stats update below only covers use_fast_path or !is_second_pass.
+ * Confirm whether second-pass checks should be counted somewhere.
+ */
}
- else {
- /* First pass: use inverted index for fast lookup */
+ else if (use_fast_path) {
+ /* First pass with inverted index: fast lookup */
ankerl::unordered_dense::set<rspamd_composite *> potentially_active;
/* Callback data for collecting potentially active composites */
(gpointer) comp,
&cd);
}
+ composites_checked += potentially_active.size();
+ }
+ else {
+ /* Slow path: check all first-pass composites */
+ msg_debug_composites("processing all %d first-pass composites (slow path)",
+ (int) cm->first_pass_composites.size());
+ for (auto *comp: cm->first_pass_composites) {
+ composites_foreach_callback((gpointer) comp->sym.c_str(),
+ (gpointer) comp,
+ &cd);
+ }
+ composites_checked += cm->first_pass_composites.size();
+ }
+ }
+
+ /* Update statistics (first pass only; second-pass counts are not recorded) */
+ if (use_fast_path) {
+ cm->stats.checked_fast += composites_checked;
+ }
+ else if (!is_second_pass) {
+ cm->stats.checked_slow += composites_checked;
+ }
+
+ /* Record timing with EMA; same guard as the start_time capture above */
+ if (do_sample && task->event_loop) {
+ ev_now_update_if_cheap(task->event_loop);
+ ev_tstamp elapsed_ms = (ev_now(task->event_loop) - start_time) * 1000.0;
+
+ if (use_fast_path) {
+ rspamd_set_counter_ema(&cm->stats.time_fast, elapsed_ms, 0.5);
+ }
+ else if (!is_second_pass) {
+ rspamd_set_counter_ema(&cm->stats.time_slow, elapsed_ms, 0.5);
}
}
*/
void rspamd_composites_process_deps(void *cm_ptr, struct rspamd_config *cfg);
+/**
+ * Enable or disable inverted index for fast composite lookup
+ * @param cm_ptr composites manager pointer
+ * @param enabled true to enable, false to disable
+ */
+void rspamd_composites_set_inverted_index(void *cm_ptr, gboolean enabled);
+
+/**
+ * Get whether inverted index is enabled
+ * @param cm_ptr composites manager pointer
+ * @return true if enabled
+ */
+gboolean rspamd_composites_get_inverted_index(void *cm_ptr);
+
+/**
+ * Statistics structure for composite processing
+ */
+struct rspamd_composites_stats_export {
+	/*
+	 * Snapshot filled by rspamd_composites_get_stats(). Counters are
+	 * cumulative; timing fields come from EMA counters that are updated
+	 * only on sampled tasks (roughly 1/256 unless composites_stats_always
+	 * is set), so *_count reflects samples, not total tasks.
+	 * NOTE(review): stats live in the composites manager, so values are
+	 * presumably per worker process — confirm before aggregating.
+	 */
+	uint64_t checked_slow; /**< composites checked via slow path */
+	uint64_t checked_fast; /**< composites checked via inverted index */
+	uint64_t matched;      /**< composites that matched */
+	double time_slow_mean;   /**< EMA mean time in slow path (ms) */
+	double time_slow_stddev; /**< EMA stddev time in slow path (ms) */
+	double time_fast_mean;   /**< EMA mean time in fast path (ms) */
+	double time_fast_stddev; /**< EMA stddev time in fast path (ms) */
+	uint64_t time_slow_count; /**< number of slow path measurements */
+	uint64_t time_fast_count; /**< number of fast path measurements */
+};
+
+/**
+ * Get composite processing statistics
+ * @param cm_ptr composites manager pointer
+ * @param stats output structure
+ */
+void rspamd_composites_get_stats(void *cm_ptr, struct rspamd_composites_stats_export *stats);
+
#ifdef __cplusplus
}
#endif
#include <string>
#include "libutil/expression.h"
+#include "libutil/util.h"
#include "libutil/cxx/hash_util.hxx"
#include "libserver/cfg_file.h"
#define COMPOSITE_MANAGER_FROM_PTR(ptr) (reinterpret_cast<rspamd::composites::composites_manager *>(ptr))
+/**
+ * Statistics for composite processing
+ */
+/**
+ * Statistics for composite processing.
+ * Updated from composites_metric_callback(); timing counters are only
+ * touched on sampled tasks, so they track a subset of all invocations.
+ * No locking is visible here — presumably accessed from a single thread
+ * per process; confirm before sharing across threads.
+ */
+struct composites_stats {
+	uint64_t checked_slow = 0; /**< composites checked via slow path */
+	uint64_t checked_fast = 0; /**< composites checked via inverted index */
+	uint64_t matched = 0;      /**< composites that matched */
+	struct rspamd_counter_data time_slow{}; /**< EMA timing for slow path */
+	struct rspamd_counter_data time_fast{}; /**< EMA timing for fast path */
+};
+
class composites_manager {
public:
composites_manager(struct rspamd_config *_cfg)
- : cfg(_cfg)
+ : cfg(_cfg), use_inverted_index(true)
{
rspamd_mempool_add_destructor(_cfg->cfg_pool, composites_manager_dtor, this);
}
/* Composites that have only negated atoms (must always be checked) */
std::vector<rspamd_composite *> not_only_composites;
+ /* Configuration flags */
+ bool use_inverted_index; /**< Use inverted index for fast composite lookup (default: true) */
+
+ /* Statistics (updated probabilistically for performance) */
+ composites_stats stats{};
+
/* Analyze composite dependencies and split into first/second pass vectors */
void process_dependencies();
/* Build inverted index for fast composite lookup */
cm->process_dependencies();
rspamd_composites_resolve_atom_types(cm);
cm->build_inverted_index();
+}
+
+void rspamd_composites_set_inverted_index(void *cm_ptr, gboolean enabled)
+{
+ /* gboolean is an int: normalise any non-zero value to a plain bool */
+ COMPOSITE_MANAGER_FROM_PTR(cm_ptr)->use_inverted_index = (enabled != 0);
+}
+
+gboolean rspamd_composites_get_inverted_index(void *cm_ptr)
+{
+ /* Map the internal bool flag onto the C-visible gboolean (1/0) */
+ return COMPOSITE_MANAGER_FROM_PTR(cm_ptr)->use_inverted_index ? 1 : 0;
+}
+
+void rspamd_composites_get_stats(void *cm_ptr, struct rspamd_composites_stats_export *stats)
+{
+ /* Copy the manager's live counters into the caller-provided export struct */
+ const auto &st = COMPOSITE_MANAGER_FROM_PTR(cm_ptr)->stats;
+
+ /* Cumulative counters */
+ stats->checked_slow = st.checked_slow;
+ stats->checked_fast = st.checked_fast;
+ stats->matched = st.matched;
+
+ /* Sampled EMA timing data (milliseconds) */
+ stats->time_slow_mean = st.time_slow.mean;
+ stats->time_slow_stddev = st.time_slow.stddev;
+ stats->time_slow_count = st.time_slow.number;
+ stats->time_fast_mean = st.time_fast.mean;
+ stats->time_fast_stddev = st.time_fast.stddev;
+ stats->time_fast_count = st.time_fast.number;
+}
\ No newline at end of file