git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Add bloom filter for fast negative symbol lookups
author Vsevolod Stakhov <vsevolod@rspamd.com>
Tue, 25 Nov 2025 17:50:48 +0000 (17:50 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Tue, 25 Nov 2025 17:50:48 +0000 (17:50 +0000)
Add an inline bloom filter (1024 bits) to rspamd_scan_result structure
for O(1) negative lookups in rspamd_task_find_symbol_result().

This optimization benefits composites evaluation where most symbol
lookups are negative (symbol not present in results). The bloom filter
is updated when symbols are inserted and checked before the hash lookup.

For 50 symbols, the false positive rate is approximately 0.9%
((1 - e^(-2*50/1024))^2 for 2 hashes over 1024 bits), meaning about
99.1% of negative lookups will be rejected without hash table access.

src/libmime/scan_result.c
src/libmime/scan_result.h

index 992a8ea49b4494f36b6361c87fab3a49354ef95a..948756cdb501236ea827f5ebf3096f90e8fae6bf 100644 (file)
 
 INIT_LOG_MODULE(metric)
 
+/*
+ * Bloom filter helpers for fast negative symbol lookups.
+ * Uses two hash functions for better distribution.
+ */
+static inline uint32_t
+rspamd_bloom_hash1(const char *s)
+{
+       /*
+        * Primary hash: walks the NUL-terminated string byte by byte and mixes
+        * each byte in with xor, a multiply by 0x5bd1e995 and a xor-shift by 15.
+        * NOTE(review): this loop does not look like wyhash; whether it matches
+        * kh_str_hash_func cannot be confirmed from here -- verify before
+        * relying on that equivalence.
+        * NOTE(review): the byte is read through plain `char`, so values >= 0x80
+        * sign-extend where char is signed; add and check share this function,
+        * so results stay consistent within one build.
+        */
+       uint32_t h = 0xcafebabe;
+       while (*s) {
+               h ^= (uint32_t) *s++;
+               h *= 0x5bd1e995;
+               h ^= h >> 15;
+       }
+       return h;
+}
+
+static inline uint32_t
+rspamd_bloom_hash2(uint32_t h)
+{
+       /*
+        * Secondary hash computed from an existing 32-bit hash value. The
+        * sequence (>> 16, * 0x85ebca6b, >> 13, * 0xc2b2ae35, >> 16) is the
+        * 32-bit finalizer (fmix32) of MurmurHash3.
+        */
+       h ^= h >> 16;
+       h *= 0x85ebca6b;
+       h ^= h >> 13;
+       h *= 0xc2b2ae35;
+       h ^= h >> 16;
+       return h;
+}
+
+#define RSPAMD_BLOOM_BITS (RSPAMD_BLOOM_SIZE * 8) /* filter size in bits (RSPAMD_BLOOM_SIZE is in bytes) */
+#define RSPAMD_BLOOM_MASK (RSPAMD_BLOOM_BITS - 1) /* reduces a hash to a bit index; only valid while RSPAMD_BLOOM_BITS is a power of two */
+#define rspamd_bloom_set(bv, h) ((bv)[(h) >> 3] |= (1U << ((h) & 7))) /* set bit h: byte h / 8, bit h % 8 */
+#define rspamd_bloom_test(bv, h) ((bv)[(h) >> 3] & (1U << ((h) & 7))) /* non-zero iff bit h is set */
+
+static inline void
+rspamd_bloom_add(uint8_t *bloom, const char *symbol)
+{ /* Record `symbol` in the filter; `bloom` must hold RSPAMD_BLOOM_SIZE bytes. */
+       uint32_t h1 = rspamd_bloom_hash1(symbol); /* hash of the symbol name */
+       uint32_t h2 = rspamd_bloom_hash2(h1);     /* second hash, derived from h1 */
+       rspamd_bloom_set(bloom, h1 & RSPAMD_BLOOM_MASK); /* one bit per hash */
+       rspamd_bloom_set(bloom, h2 & RSPAMD_BLOOM_MASK);
+}
+
+static inline int
+rspamd_bloom_check(const uint8_t *bloom, const char *symbol)
+{ /* Returns 0 if `symbol` was not added to the filter, 1 if it may have been (false positives possible). */
+       uint32_t h1 = rspamd_bloom_hash1(symbol); /* same hashes as rspamd_bloom_add */
+       uint32_t h2 = rspamd_bloom_hash2(h1);
+       /* Both bits must be set; a single clear bit proves the symbol was never added. */
+       return rspamd_bloom_test(bloom, h1 & RSPAMD_BLOOM_MASK) &&
+                  rspamd_bloom_test(bloom, h2 & RSPAMD_BLOOM_MASK);
+}
+
 /* Average symbols count to optimize hash allocation */
 static struct rspamd_counter_data symbols_count;
 
@@ -444,6 +496,9 @@ insert_metric_result(struct rspamd_task *task,
                symbol_result = rspamd_mempool_alloc0(task->task_pool, sizeof(*symbol_result));
                kh_value(metric_res->symbols, k) = symbol_result;
 
+               /* Add to bloom filter for fast negative lookups */
+               rspamd_bloom_add(metric_res->symbols_bloom, sym_cpy);
+
                symbol_result->name = sym_cpy;
                symbol_result->sym = sdef;
                symbol_result->nshots = 1;
@@ -978,6 +1033,11 @@ rspamd_task_find_symbol_result(struct rspamd_task *task, const char *sym,
                result = task->result;
        }
 
+       /* Fast path: bloom filter check for negative lookups */
+       if (!rspamd_bloom_check(result->symbols_bloom, sym)) {
+               return NULL;
+       }
+
        k = kh_get(rspamd_symbols_hash, result->symbols, sym);
 
        if (k != kh_end(result->symbols)) {
index 12fdb94599f81b58fc23742ef46269125b6a2c74..557c8c8a18b8e80e9b8de769df16d206acb3cc74 100644 (file)
@@ -97,6 +97,8 @@ struct rspamd_action_config {
 struct kh_rspamd_symbols_hash_s;
 struct kh_rspamd_symbols_group_hash_s;
 
+/* Bloom filter for fast negative symbol lookups */
+#define RSPAMD_BLOOM_SIZE 128 /* size in bytes (1024 bits); keep it a power of two, scan_result.c masks hashes with (bits - 1) */
 
 struct rspamd_scan_result {
        double score; /**< total score                                                  */
@@ -104,6 +106,7 @@ struct rspamd_scan_result {
        double positive_score;
        double negative_score;
        struct kh_rspamd_symbols_hash_s *symbols;          /**< symbols of metric                                               */
+       uint8_t symbols_bloom[RSPAMD_BLOOM_SIZE];          /**< bloom filter for fast negative lookups  */
        struct kh_rspamd_symbols_group_hash_s *sym_groups; /**< groups of symbols                                               */
        struct rspamd_action_config *actions_config;
        const char *name;         /**< for named results, NULL is the default result */