From: Vsevolod Stakhov <vsevolod@rspamd.com>
Date: Thu, 5 Feb 2026 08:53:07 +0000 (+0000)
Subject: [Fix] re_cache: Use charset-converted content for UTF-8 SARAWBODY patterns
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f18410725bd8c74b68c5fe300dde05283acb8590;p=thirdparty%2Frspamd.git

[Fix] re_cache: Use charset-converted content for UTF-8 SARAWBODY patterns

When the SARAWBODY regexp class contains UTF-8 patterns (/u flag), use
utf_content (charset-converted UTF-8 with HTML preserved) instead of
parsed content. This allows Unicode patterns like \x{200b} to match
correctly.

For non-UTF-8 patterns, continue using parsed content with raw mode
for backward compatibility with raw byte matching.

This fixes "bad utf8 input for JIT re" errors when using Unicode
patterns in rawbody rules on non-UTF-8 encoded messages.
---

diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c
index 9e6e0504e8..da76182936 100644
--- a/src/libserver/re_cache.c
+++ b/src/libserver/re_cache.c
@@ -1586,8 +1586,10 @@ rspamd_re_cache_exec_re(struct rspamd_task *task,
 		 * Multiline expressions will need to be used to match strings that are
 		 * broken by line breaks.
 		 *
-		 * Note: parsed content is transfer-decoded but NOT charset-converted,
-		 * so it may contain non-UTF-8 data. Always use raw mode.
+		 * If the regexp class contains UTF-8 patterns (/u flag), we use
+		 * charset-converted utf_content to allow Unicode matching.
+		 * Otherwise, we use parsed content (transfer-decoded only) for
+		 * backward compatibility with raw byte matching.
 		 */
 		if (MESSAGE_FIELD(task, text_parts)->len > 0) {
 			cnt = MESSAGE_FIELD(task, text_parts)->len;
@@ -1597,19 +1599,43 @@ rspamd_re_cache_exec_re(struct rspamd_task *task,
 			for (i = 0; i < cnt; i++) {
 				text_part = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), i);
 
-				if (text_part->parsed.len > 0) {
-					scvec[i] = (unsigned char *) text_part->parsed.begin;
-					lenvec[i] = text_part->parsed.len;
+				if (re_class->has_utf8) {
+					/*
+					 * Use charset-converted content for UTF-8 patterns.
+					 * This allows Unicode matching while preserving HTML tags.
+					 */
+					if (text_part->utf_content.len > 0) {
+						scvec[i] = (unsigned char *) text_part->utf_content.begin;
+						lenvec[i] = text_part->utf_content.len;
+
+						if (!IS_TEXT_PART_UTF(text_part)) {
+							raw = TRUE;
+						}
+					}
+					else {
+						scvec[i] = (unsigned char *) "";
+						lenvec[i] = 0;
+					}
 				}
 				else {
-					scvec[i] = (unsigned char *) "";
-					lenvec[i] = 0;
+					/*
+					 * Use transfer-decoded content for raw byte matching.
+					 * This is not charset-converted, so always use raw mode.
+					 */
+					if (text_part->parsed.len > 0) {
+						scvec[i] = (unsigned char *) text_part->parsed.begin;
+						lenvec[i] = text_part->parsed.len;
+					}
+					else {
+						scvec[i] = (unsigned char *) "";
+						lenvec[i] = 0;
+					}
+					raw = TRUE;
 				}
 			}
 
-			/* Always raw - parsed content is not charset-converted */
 			ret = rspamd_re_cache_process_regexp_data(rt, re,
-													  task, scvec, lenvec, cnt, TRUE, &processed_hyperscan);
+													  task, scvec, lenvec, cnt, raw, &processed_hyperscan);
 			msg_debug_re_task("checked sa rawbody regexp: %s -> %d",
 							  rspamd_regexp_get_pattern(re), ret);
 			g_free(scvec);