[Fix] multipattern: fix TLD pattern matching after hyperscan hot-swap

author Vsevolod Stakhov <vsevolod@rspamd.com>
Sun, 1 Feb 2026 15:52:56 +0000 (15:52 +0000)

committer Vsevolod Stakhov <vsevolod@rspamd.com>
Sun, 1 Feb 2026 15:52:56 +0000 (15:52 +0000)
diff --git a/src/libutil/multipattern.c b/src/libutil/multipattern.c

index b681c9598bb0d2d641d94967d016efaace36794f..16449cab881f4238ba87b7d9bd93c6a66f749f6f 100644 (file)
--- a/src/libutil/multipattern.c
+++ b/src/libutil/multipattern.c
@@ -124,16 +124,17 @@ rspamd_multipattern_escape_tld_hyperscan(const char *pattern, gsize slen,
                                                                                  gsize *dst_len)
  {
         gsize len;
-       const char *p, *prefix, *suffix;
+       const char *p, *prefix;
         char *res;
  
         /*
          * We understand the following cases
-        * 1) blah -> \.blah(?:[^a-zA-Z0-9]|$)
-        * 2) *.blah -> \.blah(?:[^a-zA-Z0-9]|$)
+        * 1) blah -> \.blah
+        * 2) *.blah -> \.blah
          *
-        * Note: We use (?:[^a-zA-Z0-9]|$) instead of \b because \b requires
-        * HS_FLAG_UCP which we don't set for TLD patterns.
+        * Boundary checking (non-alphanumeric after TLD) is done in the
+        * hyperscan callback (rspamd_multipattern_hs_cb), similar to ACISM.
+        * This ensures match length doesn't include the boundary character.
          */
  
         if (pattern[0] == '*') {
@@ -156,14 +157,9 @@ rspamd_multipattern_escape_tld_hyperscan(const char *pattern, gsize slen,
                 len = slen + strlen(prefix);
         }
  
-       /* Match end of TLD: either non-alphanumeric or end of string */
-       suffix = "(?:[^a-zA-Z0-9]|$)";
-       len += strlen(suffix);
-
         res = g_malloc(len + 1);
         slen = rspamd_strlcpy(res, prefix, len + 1);
         slen += rspamd_strlcpy(res + slen, p, len + 1 - slen);
-       slen += rspamd_strlcpy(res + slen, suffix, len + 1 - slen);
  
         *dst_len = slen;
  
@@ -880,6 +876,18 @@ rspamd_multipattern_hs_cb(unsigned int id,
                         from = 0;
                 }
  
+               /* For TLD patterns, check word boundary at end of match (like ACISM callback) */
+               if (cbd->mp->pats != NULL && id < cbd->mp->pats->len) {
+                       struct rspamd_acism_pat *pat = &g_array_index(cbd->mp->pats,
+                                                                                                                 struct rspamd_acism_pat, id);
+                       if (pat->is_tld) {
+                               if (to < cbd->len && g_ascii_isalnum(cbd->in[to])) {
+                                       /* TLD followed by alphanumeric - not a valid boundary */
+                                       return 0;
+                               }
+                       }
+               }
+
                 ret = cbd->cb(cbd->mp, id, from, to, cbd->in, cbd->len, cbd->ud);
  
                 cbd->nfound++;
author	Vsevolod Stakhov <vsevolod@rspamd.com>
	Sun, 1 Feb 2026 15:52:56 +0000 (15:52 +0000)
committer	Vsevolod Stakhov <vsevolod@rspamd.com>
	Sun, 1 Feb 2026 15:52:56 +0000 (15:52 +0000)