From: Vsevolod Stakhov <vsevolod@rspamd.com>
Date: Wed, 4 Feb 2026 12:02:06 +0000 (+0000)
Subject: [Fix] Rework alternative parts detection
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ba35d63be787c3d76f04b0604a83b033f3af5efe;p=thirdparty%2Frspamd.git

[Fix] Rework alternative parts detection
---

diff --git a/src/libmime/message.c b/src/libmime/message.c
index cca5efebed..3b8db30623 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -998,6 +998,121 @@ rspamd_mime_part_is_in_multipart_alternative(struct rspamd_mime_part *part)
 														  "alternative", 11) != NULL;
 }
 
+/*
+ * Check if `ancestor` is an ancestor of `part` (or equal to it)
+ */
+static gboolean
+rspamd_mime_part_is_ancestor(struct rspamd_mime_part *part,
+							 struct rspamd_mime_part *ancestor)
+{
+	while (part) {
+		if (part == ancestor) {
+			return TRUE;
+		}
+		part = part->parent_part;
+	}
+	return FALSE;
+}
+
+/*
+ * Recursively search for a text part in a subtree.
+ * - `root` is the multipart to search in
+ * - `exclude` is the subtree to skip (the branch containing the original part)
+ * - `want_html` TRUE means we want HTML part, FALSE means we want plain text
+ * - Returns the first matching text part, or NULL
+ * - Stops recursion if we hit a message/rfc822 part
+ */
+static struct rspamd_mime_text_part *
+rspamd_mime_part_find_text_in_subtree(struct rspamd_mime_part *root,
+									  struct rspamd_mime_part *exclude,
+									  gboolean want_html)
+{
+	if (!root || !IS_PART_MULTIPART(root) || !root->specific.mp) {
+		return NULL;
+	}
+
+	struct rspamd_mime_multipart *mp = root->specific.mp;
+
+	for (unsigned int i = 0; i < mp->children->len; i++) {
+		struct rspamd_mime_part *child = g_ptr_array_index(mp->children, i);
+
+		/* Skip the excluded subtree */
+		if (exclude && rspamd_mime_part_is_ancestor(exclude, child)) {
+			continue;
+		}
+
+		/* Stop at message/rfc822 parts (embedded messages) */
+		if (IS_PART_MESSAGE(child)) {
+			continue;
+		}
+
+		/* Check if this is the text part we want */
+		if (IS_PART_TEXT(child) && child->specific.txt) {
+			struct rspamd_mime_text_part *txt = child->specific.txt;
+
+			/* Skip attachments */
+			if (IS_TEXT_PART_ATTACHMENT(txt)) {
+				continue;
+			}
+
+			gboolean is_html = IS_TEXT_PART_HTML(txt);
+			if (want_html == is_html) {
+				return txt;
+			}
+		}
+
+		/* Recurse into multiparts */
+		if (IS_PART_MULTIPART(child)) {
+			struct rspamd_mime_text_part *found =
+				rspamd_mime_part_find_text_in_subtree(child, NULL, want_html);
+			if (found) {
+				return found;
+			}
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Find the alternative text part for a given text part.
+ * Algorithm:
+ * 1. Walk up from the part's mime_part to find a multipart/alternative ancestor
+ * 2. If found, identify which child subtree contains our part
+ * 3. Search other children for a text part of the opposite type
+ *    (plain for HTML, HTML for plain)
+ * 4. Stop if we hit message/rfc822 (embedded message)
+ */
+static struct rspamd_mime_text_part *
+rspamd_mime_text_part_find_alternative(struct rspamd_mime_text_part *text_part)
+{
+	if (!text_part || !text_part->mime_part) {
+		return NULL;
+	}
+
+	struct rspamd_mime_part *mime_part = text_part->mime_part;
+	gboolean is_html = IS_TEXT_PART_HTML(text_part);
+
+	/* Find the multipart/alternative ancestor */
+	struct rspamd_mime_part *alt_parent =
+		rspamd_mime_part_find_parent_multipart_subtype(mime_part->parent_part,
+													   "alternative", 11);
+
+	if (!alt_parent) {
+		/* No alternative parent, no alternative part */
+		return NULL;
+	}
+
+	/* Find the direct child of alt_parent that contains our mime_part */
+	struct rspamd_mime_part *our_branch = mime_part;
+	while (our_branch->parent_part && our_branch->parent_part != alt_parent) {
+		our_branch = our_branch->parent_part;
+	}
+
+	/* Search other branches for a text part of the opposite type */
+	return rspamd_mime_part_find_text_in_subtree(alt_parent, our_branch, !is_html);
+}
+
 static enum rspamd_message_part_is_text_result
 rspamd_message_part_can_be_parsed_as_text(struct rspamd_task *task,
 										  struct rspamd_mime_part *mime_part)
@@ -1855,134 +1970,117 @@ void rspamd_message_process(struct rspamd_task *task)
 		}
 	}
 
-	/* Calculate distance for 2-parts messages */
-	if (i == 2) {
-		p1 = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), 0);
-		p2 = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), 1);
+	/* Compute alternative text parts for each text part */
+	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
+	{
+		text_part->alt_text_part = rspamd_mime_text_part_find_alternative(text_part);
+	}
 
-		struct rspamd_mime_part *alt_parent1 = rspamd_mime_part_find_parent_multipart_subtype(
-			p1->mime_part->parent_part, "alternative", 11);
-		struct rspamd_mime_part *alt_parent2 = rspamd_mime_part_find_parent_multipart_subtype(
-			p2->mime_part->parent_part, "alternative", 11);
+	/* Calculate distance for alternative parts */
+	struct rspamd_mime_text_part *html_part = NULL;
 
-		/* We compare parts only if they belong to the same multipart/alternative container */
-		if (alt_parent1 && alt_parent1 == alt_parent2) {
-			if (!IS_TEXT_PART_EMPTY(p1) && !IS_TEXT_PART_EMPTY(p2) &&
-				p1->normalized_hashes && p2->normalized_hashes) {
-				/*
-					 * We also detect language on one part and propagate it to
-					 * another one
-					 */
-				struct rspamd_mime_text_part *sel;
-
-				/* Prefer HTML as text part is not displayed normally */
-				if (IS_TEXT_PART_HTML(p1)) {
-					sel = p1;
-				}
-				else if (IS_TEXT_PART_HTML(p2)) {
-					sel = p2;
-				}
-				else {
-					if (p1->utf_content.len > p2->utf_content.len) {
-						sel = p1;
-					}
-					else {
-						sel = p2;
-					}
-				}
+	/* Find the first non-attachment HTML part that has an alternative */
+	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
+	{
+		if (IS_TEXT_PART_HTML(text_part) &&
+			!IS_TEXT_PART_ATTACHMENT(text_part) &&
+			text_part->alt_text_part) {
+			html_part = text_part;
+			break;
+		}
+	}
 
-				if (sel->language && sel->language[0]) {
-					/* Propagate language */
-					if (sel == p1) {
-						if (p2->languages) {
-							g_ptr_array_unref(p2->languages);
-						}
+	if (html_part) {
+		p1 = html_part;
+		p2 = html_part->alt_text_part;
 
-						p2->language = sel->language;
-						p2->languages = g_ptr_array_ref(sel->languages);
-					}
-					else {
-						if (p1->languages) {
-							g_ptr_array_unref(p1->languages);
-						}
+		/* alt_text_part already ensures they share a multipart/alternative parent */
+		if (!IS_TEXT_PART_EMPTY(p1) && !IS_TEXT_PART_EMPTY(p2) &&
+			p1->normalized_hashes && p2->normalized_hashes) {
+			/*
+			 * We also detect language on one part and propagate it to
+			 * another one. Prefer HTML as text part is not displayed normally.
+			 */
+			struct rspamd_mime_text_part *sel = p1; /* p1 is always HTML here */
 
-						p1->language = sel->language;
-						p1->languages = g_ptr_array_ref(sel->languages);
-					}
+			if (sel->language && sel->language[0]) {
+				/* Propagate language from HTML to text part */
+				if (p2->languages) {
+					g_ptr_array_unref(p2->languages);
 				}
 
-				tw = p1->normalized_hashes->len + p2->normalized_hashes->len;
+				p2->language = sel->language;
+				p2->languages = g_ptr_array_ref(sel->languages);
+			}
 
-				if (tw > 0) {
-					dw = rspamd_words_levenshtein_distance(task,
-														   p1->normalized_hashes,
-														   p2->normalized_hashes);
-					diff = dw / (double) tw;
+			tw = p1->normalized_hashes->len + p2->normalized_hashes->len;
 
-					msg_debug_task(
-						"different words: %d, total words: %d, "
-						"got diff between parts of %.2f",
-						dw, tw,
-						diff);
+			if (tw > 0) {
+				dw = rspamd_words_levenshtein_distance(task,
+													   p1->normalized_hashes,
+													   p2->normalized_hashes);
+				diff = dw / (double) tw;
 
-					pdiff = rspamd_mempool_alloc(task->task_pool,
-												 sizeof(double));
-					*pdiff = diff;
-					rspamd_mempool_set_variable(task->task_pool,
-												"parts_distance",
-												pdiff,
-												NULL);
-					ptw = rspamd_mempool_alloc(task->task_pool,
-											   sizeof(int));
-					*ptw = tw;
-					rspamd_mempool_set_variable(task->task_pool,
-												"total_words",
-												ptw,
-												NULL);
-				}
-			}
-			else {
-				/*
-					 * Handle cases where parts differ significantly:
-					 * - One part is empty, another is not
-					 * - One part has words, another has none (but isn't empty)
-					 * In both cases, treat as 100% difference
-					 */
-				gboolean p1_has_words = p1->normalized_hashes &&
-										p1->normalized_hashes->len > 0;
-				gboolean p2_has_words = p2->normalized_hashes &&
-										p2->normalized_hashes->len > 0;
-
-				if (p1_has_words != p2_has_words) {
-					struct rspamd_mime_text_part *non_empty =
-						p1_has_words ? p1 : p2;
-
-					tw = non_empty->normalized_hashes->len;
-
-					msg_debug_task(
-						"one part has no words, another has %d words, "
-						"got diff between parts of 1.0",
-						tw);
-
-					pdiff = rspamd_mempool_alloc(task->task_pool,
-												 sizeof(double));
-					*pdiff = 1.0;
-					rspamd_mempool_set_variable(task->task_pool,
-												"parts_distance",
-												pdiff,
-												NULL);
-					ptw = rspamd_mempool_alloc(task->task_pool,
-											   sizeof(int));
-					*ptw = tw;
-					rspamd_mempool_set_variable(task->task_pool,
-												"total_words",
-												ptw,
-												NULL);
-				}
+				msg_debug_task(
+					"different words: %d, total words: %d, "
+					"got diff between parts of %.2f",
+					dw, tw,
+					diff);
+
+				pdiff = rspamd_mempool_alloc(task->task_pool,
+											 sizeof(double));
+				*pdiff = diff;
+				rspamd_mempool_set_variable(task->task_pool,
+											"parts_distance",
+											pdiff,
+											NULL);
+				ptw = rspamd_mempool_alloc(task->task_pool,
+										   sizeof(int));
+				*ptw = tw;
+				rspamd_mempool_set_variable(task->task_pool,
+											"total_words",
+											ptw,
+											NULL);
 			}
 		}
 		else {
-			debug_task("message contains two parts but they are in different multi-parts");
+			/*
+			 * Handle cases where parts differ significantly:
+			 * - One part is empty, another is not
+			 * - One part has words, another has none (but isn't empty)
+			 * In both cases, treat as 100% difference
+			 */
+			gboolean p1_has_words = p1->normalized_hashes &&
+									p1->normalized_hashes->len > 0;
+			gboolean p2_has_words = p2->normalized_hashes &&
+									p2->normalized_hashes->len > 0;
+
+			if (p1_has_words != p2_has_words) {
+				struct rspamd_mime_text_part *non_empty =
+					p1_has_words ? p1 : p2;
+
+				tw = non_empty->normalized_hashes->len;
+
+				msg_debug_task(
+					"one part has no words, another has %d words, "
+					"got diff between parts of 1.0",
+					tw);
+
+				pdiff = rspamd_mempool_alloc(task->task_pool,
+											 sizeof(double));
+				*pdiff = 1.0;
+				rspamd_mempool_set_variable(task->task_pool,
+											"parts_distance",
+											pdiff,
+											NULL);
+				ptw = rspamd_mempool_alloc(task->task_pool,
+										   sizeof(int));
+				*ptw = tw;
+				rspamd_mempool_set_variable(task->task_pool,
+											"total_words",
+											ptw,
+											NULL);
+			}
 		}
 	}
 
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 19606a8c0f..c4be39d9a3 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -163,6 +163,7 @@ struct rspamd_mime_text_part {
 	rspamd_html_heap_storage_t *cta_urls; /**< cta_heap_t* for HTML parts, NULL for plain text */
 	GList *exceptions;                    /**< list of offsets of urls						*/
 	struct rspamd_mime_part *mime_part;
+	struct rspamd_mime_text_part *alt_text_part; /**< alternative text part (text for html, html for text) */
 
 	unsigned int flags;
 	unsigned int nlines;
diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c
index 3302a0982d..d62c28735b 100644
--- a/src/libmime/mime_expressions.c
+++ b/src/libmime/mime_expressions.c
@@ -1470,21 +1470,26 @@ rspamd_has_only_html_part(struct rspamd_task *task, GArray *args,
 						  void *unused)
 {
 	struct rspamd_mime_text_part *p;
-	unsigned int i, cnt_html = 0, cnt_txt = 0;
+	unsigned int i;
 
+	/*
+	 * Return TRUE if there's any HTML part (not attachment) that has no
+	 * text/plain alternative in its multipart/alternative parent.
+	 * This properly handles nested structures like:
+	 * - multipart/mixed â multipart/related â text/html (no text/plain)
+	 * - multipart/alternative â text/plain + multipart/related â text/html
+	 */
 	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, p)
 	{
-		if (!IS_TEXT_PART_ATTACHMENT(p)) {
-			if (IS_TEXT_PART_HTML(p)) {
-				cnt_html++;
-			}
-			else {
-				cnt_txt++;
+		if (!IS_TEXT_PART_ATTACHMENT(p) && IS_TEXT_PART_HTML(p)) {
+			if (p->alt_text_part == NULL) {
+				/* HTML part with no text alternative */
+				return TRUE;
 			}
 		}
 	}
 
-	return (cnt_html > 0 && cnt_txt == 0);
+	return FALSE;
 }
 
 static gboolean
diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c
index 25b6815dbb..db60b24e2c 100644
--- a/src/lua/lua_mimepart.c
+++ b/src/lua/lua_mimepart.c
@@ -230,6 +230,13 @@ LUA_FUNCTION_DEF(textpart, get_fuzzy_hashes);
  * @return {mimepart} mimepart object
  */
 LUA_FUNCTION_DEF(textpart, get_mimepart);
+/***
+ * @method text_part:get_alt_part()
+ * Returns the alternative text part (text/plain for HTML, text/html for plain text)
+ * within the same multipart/alternative container. Returns nil if no alternative exists.
+ * @return {text_part|nil} alternative text part or nil
+ */
+LUA_FUNCTION_DEF(textpart, get_alt_part);
 
 /***
  * @method text_part:get_html_fuzzy_hashes(mempool)
@@ -260,6 +267,7 @@ static const struct luaL_reg textpartlib_m[] = {
 	LUA_INTERFACE_DEF(textpart, get_charset),
 	LUA_INTERFACE_DEF(textpart, get_languages),
 	LUA_INTERFACE_DEF(textpart, get_mimepart),
+	LUA_INTERFACE_DEF(textpart, get_alt_part),
 	LUA_INTERFACE_DEF(textpart, get_stats),
 	LUA_INTERFACE_DEF(textpart, get_fuzzy_hashes),
 	LUA_INTERFACE_DEF(textpart, get_html_fuzzy_hashes),
@@ -1616,6 +1624,27 @@ lua_textpart_get_mimepart(lua_State *L)
 	return 1;
 }
 
+static int
+lua_textpart_get_alt_part(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_mime_text_part *part = lua_check_textpart(L);
+	struct rspamd_mime_text_part **ppart;
+
+	if (part != NULL) {
+		if (part->alt_text_part != NULL) {
+			ppart = lua_newuserdata(L, sizeof(struct rspamd_mime_text_part *));
+			rspamd_lua_setclass(L, rspamd_textpart_classname, -1);
+			*ppart = part->alt_text_part;
+
+			return 1;
+		}
+	}
+
+	lua_pushnil(L);
+	return 1;
+}
+
 /***
  * @method mime_part:get_stats()
  * Returns a table with the following data:
diff --git a/test/functional/cases/001_merged/100_general.robot b/test/functional/cases/001_merged/100_general.robot
index 19ebb3e6ca..9456beaa36 100644
--- a/test/functional/cases/001_merged/100_general.robot
+++ b/test/functional/cases/001_merged/100_general.robot
@@ -6,6 +6,7 @@ Variables       ${RSPAMD_TESTDIR}/lib/vars.py
 *** Variables ***
 ${GTUBE}               ${RSPAMD_TESTDIR}/messages/gtube.eml
 ${ALT_RELATED}         ${RSPAMD_TESTDIR}/messages/alternative-related.eml
+${MIXED_RELATED_HTML}  ${RSPAMD_TESTDIR}/messages/mixed-related-html-only.eml
 ${SETTINGS_NOSYMBOLS}  {symbols_enabled = []}
 
 *** Test Cases ***
@@ -66,6 +67,11 @@ HTML ONLY - multipart/related inside alternative
   ...  Settings={symbols_enabled = [MIME_HTML_ONLY]}
   Do Not Expect Symbol  MIME_HTML_ONLY
 
+HTML ONLY - multipart/mixed with related (html only)
+  Scan File  ${MIXED_RELATED_HTML}
+  ...  Settings={symbols_enabled = [MIME_HTML_ONLY]}
+  Expect Symbol  MIME_HTML_ONLY
+
 PARTS DIFFER - multipart/related inside alternative
   Scan File  ${ALT_RELATED}
   ...  Settings={symbols_enabled = [R_PARTS_DIFFER]}
diff --git a/test/functional/messages/btc.eml b/test/functional/messages/btc.eml
index 840a5cbf1a..20ceb1dacc 100644
--- a/test/functional/messages/btc.eml
+++ b/test/functional/messages/btc.eml
@@ -5,7 +5,7 @@ Date: Mon, 27 Apr 2020 19:54:10 +0300
 Message-ID: <CA+1S=h4aGimA6vSBJF=t1F+5z-Mua5+Cimf+NU_NDWJk8ZNOcw@mail.gmail.com>
 Subject: Fwd:
 To: <test@test.ru>
-Content-Type: multipart/alternative; boundary="00000000000004de7805a4489190"
+Content-Type: multipart/alternative; boundary="0000000000004bee6805a4484c02"
 
 --0000000000004bee6805a4484c02
 Content-Type: text/plain; charset="UTF-8"
diff --git a/test/functional/messages/mixed-related-html-only.eml b/test/functional/messages/mixed-related-html-only.eml
new file mode 100644
index 0000000000..616137ffee
--- /dev/null
+++ b/test/functional/messages/mixed-related-html-only.eml
@@ -0,0 +1,35 @@
+Return-Path: test@test.com
+From: TEST <test@test.com>
+To: Me <me@me.me>
+Subject: multipart/mixed with related (html only)
+MIME-Version: 1.0
+Date: Mon, 01 Jan 2024 00:00:00 +0000
+Message-ID: <mixed-related-html-only@test.com>
+Content-Type: multipart/mixed; boundary="19//AYBSG9F/KH4LSBP="
+
+--19//AYBSG9F/KH4LSBP=
+Content-Type: multipart/related; boundary="29//AYBSG9F/KH4LSBP="
+
+--29//AYBSG9F/KH4LSBP=
+Content-Type: text/html; charset="UTF-8"
+
+<html>
+<body>
+This is an HTML-only message body.
+</body>
+</html>
+
+--29//AYBSG9F/KH4LSBP=
+Content-Type: image/jpeg; name="dc660250d1b3d161181bd661989c2a74.jpg"
+Content-ID: <dc660250d1b3d161181bd661989c2a74.jpg>
+
+dummy
+
+--29//AYBSG9F/KH4LSBP=--
+
+--19//AYBSG9F/KH4LSBP=
+Content-Type: application/pdf; name="Chamblisslaw Payroll Wages_Memo_1NZ7-CIRM1N-MGX6.pdf"
+
+dummy
+
+--19//AYBSG9F/KH4LSBP=--