From 16e3dbe5bbdcfa96c1e7d822d4bcdead7967a848 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 6 Feb 2026 11:22:07 +0000 Subject: [PATCH] [Fix] re_cache: Always use charset-converted content for SARAWBODY matching Use utf_raw_content (charset-converted UTF-8 with HTML tags preserved) for all SARAWBODY patterns, regardless of /u flag presence. The previous approach used utf_content (which strips HTML tags on HTML parts) and only for classes containing /u patterns, leaving non-/u patterns matching against raw bytes in the original charset. This prevents trivial bypass of SA rawbody rules via exotic encodings like UTF-16 and ensures consistent matching across PCRE and Hyperscan. Falls back to transfer-decoded parsed content only when charset conversion failed. --- src/libserver/re_cache.c | 53 +++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c index 5fa410969b..3e8e77faf4 100644 --- a/src/libserver/re_cache.c +++ b/src/libserver/re_cache.c @@ -1586,10 +1586,13 @@ rspamd_re_cache_exec_re(struct rspamd_task *task, * Multiline expressions will need to be used to match strings that are * broken by line breaks. * - * If the regexp class contains UTF-8 patterns (/u flag), we use - * charset-converted utf_content to allow Unicode matching. - * Otherwise, we use parsed content (transfer-decoded only) for - * backward compatibility with raw byte matching. + * We always use utf_raw_content (charset-converted to UTF-8 with + * HTML tags preserved) so that patterns match consistently + * regardless of the original message encoding. This prevents + * trivial bypass via exotic charsets like UTF-16. + * + * If charset conversion failed (utf_raw_content is NULL), fall + * back to parsed content (transfer-decoded only) with raw mode. */ if (MESSAGE_FIELD(task, text_parts)->len > 0) { cnt = MESSAGE_FIELD(task, text_parts)->len; @@ -1599,39 +1602,33 @@ rspamd_re_cache_exec_re(struct rspamd_task *task, for (i = 0; i < cnt; i++) { text_part = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), i); - if (re_class->has_utf8) { + if (text_part->utf_raw_content != NULL && + text_part->utf_raw_content->len > 0) { /* - * Use charset-converted content for UTF-8 patterns. - * This allows Unicode matching while preserving HTML tags. + * Use charset-converted UTF-8 content with HTML tags + * preserved. This is the correct representation for + * SA rawbody matching. */ - if (text_part->utf_content.len > 0) { - scvec[i] = (unsigned char *) text_part->utf_content.begin; - lenvec[i] = text_part->utf_content.len; + scvec[i] = text_part->utf_raw_content->data; + lenvec[i] = text_part->utf_raw_content->len; - if (!IS_TEXT_PART_UTF(text_part)) { - raw = TRUE; - } - } - else { - scvec[i] = (unsigned char *) ""; - lenvec[i] = 0; + if (!IS_TEXT_PART_UTF(text_part)) { + raw = TRUE; } } - else { + else if (text_part->parsed.len > 0) { /* - * Use transfer-decoded content for raw byte matching. - * This is not charset-converted, so always use raw mode. + * Charset conversion failed; fall back to + * transfer-decoded content in raw mode. */ - if (text_part->parsed.len > 0) { - scvec[i] = (unsigned char *) text_part->parsed.begin; - lenvec[i] = text_part->parsed.len; - } - else { - scvec[i] = (unsigned char *) ""; - lenvec[i] = 0; - } + scvec[i] = (unsigned char *) text_part->parsed.begin; + lenvec[i] = text_part->parsed.len; raw = TRUE; } + else { + scvec[i] = (unsigned char *) ""; + lenvec[i] = 0; + } } ret = rspamd_re_cache_process_regexp_data(rt, re, -- 2.47.3