git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Fix] re_cache: Always use raw mode for SARAWBODY regexps
author: Vsevolod Stakhov <vsevolod@rspamd.com>
Wed, 4 Feb 2026 18:44:18 +0000 (18:44 +0000)
committer: Vsevolod Stakhov <vsevolod@rspamd.com>
Wed, 4 Feb 2026 18:44:18 +0000 (18:44 +0000)
The parsed content of a text part (text_part->parsed) is transfer-decoded
(base64/QP) but NOT charset-converted, so it may contain non-UTF-8 data even
when IS_TEXT_PART_UTF is true.

Setting the raw flag dynamically based on IS_TEXT_PART_UTF was incorrect,
because that flag indicates whether utf_content is valid UTF-8, not whether
the parsed content is valid UTF-8.

The bug was introduced in 0d62dd6513 (1.8.3); this commit restores the
original behavior of always treating SARAWBODY content as raw.

src/libserver/re_cache.c

index f6c6b8ce98bed440300246caca96ad0980231b7e..9e6e0504e83c80e1208f9a010351ab2fba7697d5 100644 (file)
@@ -1585,6 +1585,9 @@ rspamd_re_cache_exec_re(struct rspamd_task *task,
                 * encoding, but HTML tags and line breaks will still be present.
                 * Multiline expressions will need to be used to match strings that are
                 * broken by line breaks.
+                *
+                * Note: parsed content is transfer-decoded but NOT charset-converted,
+                * so it may contain non-UTF-8 data. Always use raw mode.
                 */
                if (MESSAGE_FIELD(task, text_parts)->len > 0) {
                        cnt = MESSAGE_FIELD(task, text_parts)->len;
@@ -1597,10 +1600,6 @@ rspamd_re_cache_exec_re(struct rspamd_task *task,
                                if (text_part->parsed.len > 0) {
                                        scvec[i] = (unsigned char *) text_part->parsed.begin;
                                        lenvec[i] = text_part->parsed.len;
-
-                                       if (!IS_TEXT_PART_UTF(text_part)) {
-                                               raw = TRUE;
-                                       }
                                }
                                else {
                                        scvec[i] = (unsigned char *) "";
@@ -1608,8 +1607,9 @@ rspamd_re_cache_exec_re(struct rspamd_task *task,
                                }
                        }
 
+                       /* Always raw - parsed content is not charset-converted */
                        ret = rspamd_re_cache_process_regexp_data(rt, re,
-                                                                                                         task, scvec, lenvec, cnt, raw, &processed_hyperscan);
+                                                                                                         task, scvec, lenvec, cnt, TRUE, &processed_hyperscan);
                        msg_debug_re_task("checked sa rawbody regexp: %s -> %d",
                                                          rspamd_regexp_get_pattern(re), ret);
                        g_free(scvec);