From: Vsevolod Stakhov Date: Tue, 25 Nov 2025 17:50:48 +0000 (+0000) Subject: [Feature] Add bloom filter for fast negative symbol lookups X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=dd9935d9f983c2fc3827e0d6e052f0c4ca305098;p=thirdparty%2Frspamd.git [Feature] Add bloom filter for fast negative symbol lookups Add an inline bloom filter (1024 bits) to the rspamd_scan_result structure for O(1) negative lookups in rspamd_task_find_symbol_result(). This optimization benefits composites evaluation where most symbol lookups are negative (symbol not present in results). The bloom filter is updated when symbols are inserted and checked before the hash lookup. For 50 symbols with two bits set per symbol, the false positive rate is approximately 0.9% ((1 - e^(-2*50/1024))^2), meaning about 99% of negative lookups will be rejected without hash table access. --- diff --git a/src/libmime/scan_result.c b/src/libmime/scan_result.c index 992a8ea49b..948756cdb5 100644 --- a/src/libmime/scan_result.c +++ b/src/libmime/scan_result.c @@ -32,6 +32,58 @@ INIT_LOG_MODULE(metric) +/* + * Bloom filter helpers for fast negative symbol lookups. + * Uses two hash functions for better distribution. 
+ */ +static inline uint32_t +rspamd_bloom_hash1(const char *s) +{ + /* Per-byte xor, multiply by 0x5bd1e995, then xor-shift by 15, starting from seed 0xcafebabe. NOTE(review): the loop is a simple 32-bit multiplicative mix; it does not look like wyhash, and whether it matches kh_str_hash_func cannot be confirmed from here */ + uint32_t h = 0xcafebabe; + while (*s) { + h ^= (uint32_t) *s++; + h *= 0x5bd1e995; + h ^= h >> 15; + } + return h; +} + +static inline uint32_t +rspamd_bloom_hash2(uint32_t h) +{ + /* Secondary hash: scrambles an existing 32-bit hash with alternating xor-shift and multiply steps (murmur-like mixing) */ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + return h; +} + +#define RSPAMD_BLOOM_BITS (RSPAMD_BLOOM_SIZE * 8) +#define RSPAMD_BLOOM_MASK (RSPAMD_BLOOM_BITS - 1) +#define rspamd_bloom_set(bv, h) ((bv)[(h) >> 3] |= (1U << ((h) & 7))) +#define rspamd_bloom_test(bv, h) ((bv)[(h) >> 3] & (1U << ((h) & 7))) + +static inline void +rspamd_bloom_add(uint8_t *bloom, const char *symbol) +{ + uint32_t h1 = rspamd_bloom_hash1(symbol); + uint32_t h2 = rspamd_bloom_hash2(h1); + rspamd_bloom_set(bloom, h1 & RSPAMD_BLOOM_MASK); + rspamd_bloom_set(bloom, h2 & RSPAMD_BLOOM_MASK); +} + +static inline int +rspamd_bloom_check(const uint8_t *bloom, const char *symbol) +{ + uint32_t h1 = rspamd_bloom_hash1(symbol); + uint32_t h2 = rspamd_bloom_hash2(h1); + return rspamd_bloom_test(bloom, h1 & RSPAMD_BLOOM_MASK) && + rspamd_bloom_test(bloom, h2 & RSPAMD_BLOOM_MASK); +} + /* Average symbols count to optimize hash allocation */ static struct rspamd_counter_data symbols_count; @@ -444,6 +496,9 @@ insert_metric_result(struct rspamd_task *task, symbol_result = rspamd_mempool_alloc0(task->task_pool, sizeof(*symbol_result)); kh_value(metric_res->symbols, k) = symbol_result; + /* Add to bloom filter for fast negative lookups */ + rspamd_bloom_add(metric_res->symbols_bloom, sym_cpy); + symbol_result->name = sym_cpy; symbol_result->sym = sdef; symbol_result->nshots = 1; @@ -978,6 +1033,11 @@ rspamd_task_find_symbol_result(struct rspamd_task *task, const char *sym, result = task->result; } + /* Fast path: a bloom miss means sym was never passed to rspamd_bloom_add for this result, so return NULL without the hash lookup */ + if (!rspamd_bloom_check(result->symbols_bloom, sym)) { + return NULL; + 
} + k = kh_get(rspamd_symbols_hash, result->symbols, sym); if (k != kh_end(result->symbols)) { diff --git a/src/libmime/scan_result.h b/src/libmime/scan_result.h index 12fdb94599..557c8c8a18 100644 --- a/src/libmime/scan_result.h +++ b/src/libmime/scan_result.h @@ -97,6 +97,8 @@ struct rspamd_action_config { struct kh_rspamd_symbols_hash_s; struct kh_rspamd_symbols_group_hash_s; +/* Size in bytes of the bloom filter used for fast negative symbol lookups; keep it a power of two because scan_result.c masks bit indices with (RSPAMD_BLOOM_SIZE * 8) - 1 */ +#define RSPAMD_BLOOM_SIZE 128 /* 1024 bits */ struct rspamd_scan_result { double score; /**< total score */ @@ -104,6 +106,7 @@ struct rspamd_scan_result { double positive_score; double negative_score; struct kh_rspamd_symbols_hash_s *symbols; /**< symbols of metric */ + uint8_t symbols_bloom[RSPAMD_BLOOM_SIZE]; /**< bloom filter for fast negative lookups */ struct kh_rspamd_symbols_group_hash_s *sym_groups; /**< groups of symbols */ struct rspamd_action_config *actions_config; const char *name; /**< for named results, NULL is the default result */