git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Compile small hyperscan databases in memory without file caching
author: Vsevolod Stakhov <vsevolod@rspamd.com>
Sat, 10 Jan 2026 21:18:21 +0000 (21:18 +0000)
committer: Vsevolod Stakhov <vsevolod@rspamd.com>
Sat, 10 Jan 2026 21:18:21 +0000 (21:18 +0000)
For small pattern sets (< 100 patterns), compile hyperscan databases
synchronously in memory without saving to file or Redis cache.
These databases are shared with workers via fork() COW semantics.

Large pattern sets (like TLD with 10000+ patterns) continue to use
async compilation via hs_helper with Redis caching.

This eliminates unnecessary .hs files in /var/lib/rspamd for small
databases while maintaining the async path for expensive compilations.

src/libserver/maps/map_helpers.c
src/libutil/multipattern.c

index 3f5598b2de998a0f006d3200958fbe6606c97559..2dc3f88e331504e3711bed587a03e21a967c6960 100644 (file)
 static const uint64_t map_hash_seed = 0xdeadbabeULL;
 static const char *const hash_fill = "1";
 
+#ifdef WITH_HYPERSCAN
+/*
+ * Threshold for "small" regexp maps that are compiled in memory
+ * without file/Redis caching. These are shared with workers via fork().
+ * Maps at or above this threshold use async compilation with caching.
+ */
+#define RSPAMD_REGEXP_MAP_SMALL_THRESHOLD 100
+#endif
+
 struct rspamd_map_helper_value {
        gsize hits;
        gconstpointer key;
@@ -1230,16 +1239,62 @@ rspamd_re_map_finalize(struct rspamd_regexp_map_helper *re_map)
        }
 
        /*
-        * Instead of compiling hyperscan synchronously here (which blocks the main process),
-        * we add this map to the pending compilation queue. The hs_helper worker will compile
-        * the hyperscan database asynchronously and notify workers when it's ready.
-        *
-        * In the meantime, we use PCRE fallback via the regexps array.
+        * Small regexp maps: compile in memory synchronously.
+        * They will be shared with workers via fork() COW.
+        * Large maps: queue for async compilation via hs_helper.
         */
-       msg_info_map("regexp map %s (%ud patterns) queued for async hyperscan compilation, using PCRE fallback",
-                                map->name, re_map->regexps->len);
+       if (re_map->regexps->len < RSPAMD_REGEXP_MAP_SMALL_THRESHOLD) {
+               hs_database_t *db = NULL;
+               hs_compile_error_t *hs_errors = NULL;
+
+               msg_info_map("regexp map %s (%ud patterns) is small, compiling in memory",
+                                        map->name, re_map->regexps->len);
+
+               if (hs_compile_multi((const char **) re_map->patterns,
+                                                        re_map->flags,
+                                                        re_map->ids,
+                                                        re_map->regexps->len,
+                                                        HS_MODE_BLOCK,
+                                                        &plt,
+                                                        &db,
+                                                        &hs_errors) != HS_SUCCESS) {
+                       msg_warn_map("cannot compile hyperscan for regexp map %s: %s (pattern %d), using PCRE fallback",
+                                                map->name,
+                                                hs_errors ? hs_errors->message : "unknown error",
+                                                hs_errors ? hs_errors->expression : -1);
+                       if (hs_errors) {
+                               hs_free_compile_error(hs_errors);
+                       }
+                       /* Fall through - will use PCRE fallback */
+               }
+               else {
+                       /* Create hyperscan wrapper without file association */
+                       re_map->hs_db = rspamd_hyperscan_from_raw_db(db, NULL);
+
+                       if (hs_alloc_scratch(rspamd_hyperscan_get_database(re_map->hs_db),
+                                                                &re_map->hs_scratch) != HS_SUCCESS) {
+                               msg_err_map("cannot allocate scratch for regexp map %s, using PCRE fallback",
+                                                       map->name);
+                               rspamd_hyperscan_free(re_map->hs_db, true);
+                               re_map->hs_db = NULL;
+                               re_map->hs_scratch = NULL;
+                       }
+                       else {
+                               msg_info_map("regexp map %s compiled in memory successfully",
+                                                        map->name);
+                       }
+               }
+       }
+       else {
+               /*
+                * Large regexp maps: queue for async compilation via hs_helper.
+                * Use PCRE fallback until hyperscan is ready.
+                */
+               msg_info_map("regexp map %s (%ud patterns) queued for async hyperscan compilation, using PCRE fallback",
+                                        map->name, re_map->regexps->len);
 
-       rspamd_regexp_map_add_pending(re_map, map->name);
+               rspamd_regexp_map_add_pending(re_map, map->name);
+       }
 #endif
 }
 
index 127233c876cd07640ac79da0cdf745b039ba1408..de99229e77ced6f198389d186cbbc9319a270fe0 100644 (file)
 
 #define MAX_SCRATCH 4
 
+/*
+ * Threshold for "small" multipatterns that are compiled in memory
+ * without file/Redis caching. These are shared with workers via fork().
+ * Pattern sets at or above this threshold use async compilation with caching.
+ */
+#define RSPAMD_MULTIPATTERN_SMALL_THRESHOLD 100
+
 #define msg_debug_multipattern(...) rspamd_conditional_debug_fast(NULL, NULL,                 \
                                                                                                                                  rspamd_multipattern_log_id, \
                                                                                                                                  "multipattern", NULL,       \
@@ -819,6 +826,21 @@ rspamd_multipattern_compile(struct rspamd_multipattern *mp, int flags, GError **
                }
 
                /* SYNC mode (default): try cache, sync compile on miss */
+
+               /* Small patterns: compile in memory, no file caching.
+                * They will be shared with workers via fork() COW. */
+               if (mp->cnt < RSPAMD_MULTIPATTERN_SMALL_THRESHOLD) {
+                       msg_debug_multipattern("small pattern set (%ud < %d), compiling in memory",
+                                                                  mp->cnt, RSPAMD_MULTIPATTERN_SMALL_THRESHOLD);
+                       if (!rspamd_multipattern_compile_hs_sync(mp, hash,
+                                                                                                        flags | RSPAMD_MULTIPATTERN_COMPILE_NO_FS, err)) {
+                               return FALSE;
+                       }
+                       mp->compiled = TRUE;
+                       return TRUE;
+               }
+
+               /* Large patterns: use file cache */
                if (!(flags & RSPAMD_MULTIPATTERN_COMPILE_NO_FS) &&
                        rspamd_multipattern_try_load_hs(mp, hash)) {
                        if (!rspamd_multipattern_alloc_scratch(mp, err)) {