git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Fix] Exclude injected parts from SA body/rawbody regexp scanning
author Vsevolod Stakhov <vsevolod@rspamd.com>
Sat, 28 Mar 2026 09:07:19 +0000 (09:07 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Sat, 28 Mar 2026 09:07:19 +0000 (09:07 +0000)
PDF text extraction injects synthetic text parts that were being scanned
by sa_body and sa_raw_body rules, causing a 30x increase in false positives
on rules matching null bytes (e.g. /\x00/{sa_raw_body}). PDF hex strings
produce raw bytes, including \x00, that are not meaningful in extracted text.

Two fixes:
- Strip null bytes and control characters from extracted PDF text in
  sanitize_pdf_text() for the non-UTF-16 code path
- Skip RSPAMD_MIME_PART_COMPUTED parts in SA body and rawbody scanning
  to follow original SA semantics where only real MIME text parts are matched

lualib/lua_content/pdf.lua
src/libserver/re_cache.c

index 089971c446003d18bfa4b71437e967793cdee8e0..f9920887ed582f377494b648e9aaf56722fe081c 100644 (file)
@@ -466,6 +466,27 @@ local function gen_text_grammar()
        end
     end
 
+    -- Strip null bytes and control characters from non-UTF-16 text.
+    -- PDF hex strings like <0041> produce raw bytes including \x00 which
+    -- are not meaningful in extracted text and cause false positives in
+    -- sa_raw_body rules matching \x00 patterns.
+    local has_control = false
+    for i = 1, len do
+      local b = s:byte(i)
+      if b == 0 or (b < 32 and b ~= 9 and b ~= 10 and b ~= 13) then
+        has_control = true
+        break
+      end
+    end
+
+    if has_control then
+      -- Remove null bytes and other control characters (keep tab, newline, carriage return)
+      s = s:gsub('[%z\1-\8\11\12\14-\31]', '')
+      if #s == 0 then
+        return ''
+      end
+    end
+
     return s
   end
 
index a65325200a5caac6f503b4f963d6377d2d2e18ff..f91cc53179f5a4cc3c6c52a9b4904f1cbee888ac 100644 (file)
@@ -1667,26 +1667,38 @@ rspamd_re_cache_exec_re(struct rspamd_task *task,
                        lenvec[0] = 0;
                }
 
-               PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
                {
-                       if (text_part->utf_stripped_content) {
-                               scvec[i + 1] = (unsigned char *) text_part->utf_stripped_content->data;
-                               lenvec[i + 1] = text_part->utf_stripped_content->len;
+                       unsigned int real_cnt = 1; /* start at 1 for Subject */
 
-                               if (!IS_TEXT_PART_UTF(text_part)) {
-                                       raw = TRUE;
+                       PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
+                       {
+                               /* Skip injected/computed parts (e.g. PDF extracted text) —
+                                * SA body should only scan original MIME text parts */
+                               if (text_part->mime_part->flags & RSPAMD_MIME_PART_COMPUTED) {
+                                       continue;
                                }
+
+                               if (text_part->utf_stripped_content) {
+                                       scvec[real_cnt] = (unsigned char *) text_part->utf_stripped_content->data;
+                                       lenvec[real_cnt] = text_part->utf_stripped_content->len;
+
+                                       if (!IS_TEXT_PART_UTF(text_part)) {
+                                               raw = TRUE;
+                                       }
+                               }
+                               else {
+                                       scvec[real_cnt] = (unsigned char *) "";
+                                       lenvec[real_cnt] = 0;
+                               }
+
+                               real_cnt++;
                        }
-                       else {
-                               scvec[i + 1] = (unsigned char *) "";
-                               lenvec[i + 1] = 0;
-                       }
-               }
 
-               ret = rspamd_re_cache_process_regexp_data(rt, re,
-                                                                                                 task, scvec, lenvec, cnt, raw, &processed_hyperscan);
-               msg_debug_re_task("checked sa body regexp: %s -> %d",
-                                                 rspamd_regexp_get_pattern(re), ret);
+                       ret = rspamd_re_cache_process_regexp_data(rt, re,
+                                                                                                         task, scvec, lenvec, real_cnt, raw, &processed_hyperscan);
+                       msg_debug_re_task("checked sa body regexp: %s -> %d",
+                                                         rspamd_regexp_get_pattern(re), ret);
+               }
                g_free(scvec);
                g_free(lenvec);
                break;
@@ -1711,9 +1723,17 @@ rspamd_re_cache_exec_re(struct rspamd_task *task,
                        scvec = g_malloc(sizeof(*scvec) * cnt);
                        lenvec = g_malloc(sizeof(*lenvec) * cnt);
 
+                       unsigned int real_cnt = 0;
+
                        for (i = 0; i < cnt; i++) {
                                text_part = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), i);
 
+                               /* Skip injected/computed parts (e.g. PDF extracted text) —
+                                * SA rawbody should only scan original MIME text parts */
+                               if (text_part->mime_part->flags & RSPAMD_MIME_PART_COMPUTED) {
+                                       continue;
+                               }
+
                                if (text_part->utf_raw_content != NULL &&
                                        text_part->utf_raw_content->len > 0) {
                                        /*
@@ -1721,8 +1741,8 @@ rspamd_re_cache_exec_re(struct rspamd_task *task,
                                         * preserved. This is the correct representation for
                                         * SA rawbody matching.
                                         */
-                                       scvec[i] = text_part->utf_raw_content->data;
-                                       lenvec[i] = text_part->utf_raw_content->len;
+                                       scvec[real_cnt] = text_part->utf_raw_content->data;
+                                       lenvec[real_cnt] = text_part->utf_raw_content->len;
 
                                        if (!IS_TEXT_PART_UTF(text_part)) {
                                                raw = TRUE;
@@ -1733,20 +1753,24 @@ rspamd_re_cache_exec_re(struct rspamd_task *task,
                                         * Charset conversion failed; fall back to
                                         * transfer-decoded content in raw mode.
                                         */
-                                       scvec[i] = (unsigned char *) text_part->parsed.begin;
-                                       lenvec[i] = text_part->parsed.len;
+                                       scvec[real_cnt] = (unsigned char *) text_part->parsed.begin;
+                                       lenvec[real_cnt] = text_part->parsed.len;
                                        raw = TRUE;
                                }
                                else {
-                                       scvec[i] = (unsigned char *) "";
-                                       lenvec[i] = 0;
+                                       scvec[real_cnt] = (unsigned char *) "";
+                                       lenvec[real_cnt] = 0;
                                }
+
+                               real_cnt++;
                        }
 
-                       ret = rspamd_re_cache_process_regexp_data(rt, re,
-                                                                                                         task, scvec, lenvec, cnt, raw, &processed_hyperscan);
-                       msg_debug_re_task("checked sa rawbody regexp: %s -> %d",
-                                                         rspamd_regexp_get_pattern(re), ret);
+                       if (real_cnt > 0) {
+                               ret = rspamd_re_cache_process_regexp_data(rt, re,
+                                                                                                                 task, scvec, lenvec, real_cnt, raw, &processed_hyperscan);
+                               msg_debug_re_task("checked sa rawbody regexp: %s -> %d",
+                                                                 rspamd_regexp_get_pattern(re), ret);
+                       }
                        g_free(scvec);
                        g_free(lenvec);
                }