[Fix] Rework alternative parts detection

author Vsevolod Stakhov <vsevolod@rspamd.com>

Wed, 4 Feb 2026 12:02:06 +0000 (12:02 +0000)

committer Vsevolod Stakhov <vsevolod@rspamd.com>

Wed, 4 Feb 2026 12:02:06 +0000 (12:02 +0000)
author Vsevolod Stakhov <vsevolod@rspamd.com>
Wed, 4 Feb 2026 12:02:06 +0000 (12:02 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Wed, 4 Feb 2026 12:02:06 +0000 (12:02 +0000)
diff --git a/src/libmime/message.c b/src/libmime/message.c

index cca5efebed585084f8b36afd87424e284590313f..3b8db306234c87ca93cb75ae7ea60a0da12841be 100644 (file)
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -998,6 +998,121 @@ rspamd_mime_part_is_in_multipart_alternative(struct rspamd_mime_part *part)
                                                                                                                   "alternative", 11) != NULL;
  }
  
+/*
+ * Check if `ancestor` is an ancestor of `part` (or equal to it)
+ */
+static gboolean
+rspamd_mime_part_is_ancestor(struct rspamd_mime_part *part,
+                                                        struct rspamd_mime_part *ancestor)
+{
+       while (part) {
+               if (part == ancestor) {
+                       return TRUE;
+               }
+               part = part->parent_part;
+       }
+       return FALSE;
+}
+
+/*
+ * Recursively search for a text part in a subtree.
+ * - `root` is the multipart to search in
+ * - `exclude` is the subtree to skip (the branch containing the original part)
+ * - `want_html` TRUE means we want HTML part, FALSE means we want plain text
+ * - Returns the first matching text part, or NULL
+ * - Stops recursion if we hit a message/rfc822 part
+ */
+static struct rspamd_mime_text_part *
+rspamd_mime_part_find_text_in_subtree(struct rspamd_mime_part *root,
+                                                                         struct rspamd_mime_part *exclude,
+                                                                         gboolean want_html)
+{
+       if (!root || !IS_PART_MULTIPART(root) || !root->specific.mp) {
+               return NULL;
+       }
+
+       struct rspamd_mime_multipart *mp = root->specific.mp;
+
+       for (unsigned int i = 0; i < mp->children->len; i++) {
+               struct rspamd_mime_part *child = g_ptr_array_index(mp->children, i);
+
+               /* Skip the excluded subtree */
+               if (exclude && rspamd_mime_part_is_ancestor(exclude, child)) {
+                       continue;
+               }
+
+               /* Stop at message/rfc822 parts (embedded messages) */
+               if (IS_PART_MESSAGE(child)) {
+                       continue;
+               }
+
+               /* Check if this is the text part we want */
+               if (IS_PART_TEXT(child) && child->specific.txt) {
+                       struct rspamd_mime_text_part *txt = child->specific.txt;
+
+                       /* Skip attachments */
+                       if (IS_TEXT_PART_ATTACHMENT(txt)) {
+                               continue;
+                       }
+
+                       gboolean is_html = IS_TEXT_PART_HTML(txt);
+                       if (want_html == is_html) {
+                               return txt;
+                       }
+               }
+
+               /* Recurse into multiparts */
+               if (IS_PART_MULTIPART(child)) {
+                       struct rspamd_mime_text_part *found =
+                               rspamd_mime_part_find_text_in_subtree(child, NULL, want_html);
+                       if (found) {
+                               return found;
+                       }
+               }
+       }
+
+       return NULL;
+}
+
+/*
+ * Find the alternative text part for a given text part.
+ * Algorithm:
+ * 1. Walk up from the part's mime_part to find a multipart/alternative ancestor
+ * 2. If found, identify which child subtree contains our part
+ * 3. Search other children for a text part of the opposite type
+ *    (plain for HTML, HTML for plain)
+ * 4. Stop if we hit message/rfc822 (embedded message)
+ */
+static struct rspamd_mime_text_part *
+rspamd_mime_text_part_find_alternative(struct rspamd_mime_text_part *text_part)
+{
+       if (!text_part || !text_part->mime_part) {
+               return NULL;
+       }
+
+       struct rspamd_mime_part *mime_part = text_part->mime_part;
+       gboolean is_html = IS_TEXT_PART_HTML(text_part);
+
+       /* Find the multipart/alternative ancestor */
+       struct rspamd_mime_part *alt_parent =
+               rspamd_mime_part_find_parent_multipart_subtype(mime_part->parent_part,
+                                                                                                          "alternative", 11);
+
+       if (!alt_parent) {
+               /* No alternative parent, no alternative part */
+               return NULL;
+       }
+
+       /* Find the direct child of alt_parent that contains our mime_part */
+       struct rspamd_mime_part *our_branch = mime_part;
+       while (our_branch->parent_part && our_branch->parent_part != alt_parent) {
+               our_branch = our_branch->parent_part;
+       }
+
+       /* Search other branches for a text part of the opposite type */
+       return rspamd_mime_part_find_text_in_subtree(alt_parent, our_branch, !is_html);
+}
+
  static enum rspamd_message_part_is_text_result
  rspamd_message_part_can_be_parsed_as_text(struct rspamd_task *task,
                                                                                   struct rspamd_mime_part *mime_part)
@@ -1855,134 +1970,117 @@ void rspamd_message_process(struct rspamd_task *task)
                 }
         }
  
-       /* Calculate distance for 2-parts messages */
-       if (i == 2) {
-               p1 = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), 0);
-               p2 = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), 1);
+       /* Compute alternative text parts for each text part */
+       PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
+       {
+               text_part->alt_text_part = rspamd_mime_text_part_find_alternative(text_part);
+       }
  
-               struct rspamd_mime_part *alt_parent1 = rspamd_mime_part_find_parent_multipart_subtype(
-                       p1->mime_part->parent_part, "alternative", 11);
-               struct rspamd_mime_part *alt_parent2 = rspamd_mime_part_find_parent_multipart_subtype(
-                       p2->mime_part->parent_part, "alternative", 11);
+       /* Calculate distance for alternative parts */
+       struct rspamd_mime_text_part *html_part = NULL;
  
-               /* We compare parts only if they belong to the same multipart/alternative container */
-               if (alt_parent1 && alt_parent1 == alt_parent2) {
-                       if (!IS_TEXT_PART_EMPTY(p1) && !IS_TEXT_PART_EMPTY(p2) &&
-                               p1->normalized_hashes && p2->normalized_hashes) {
-                               /*
-                                        * We also detect language on one part and propagate it to
-                                        * another one
-                                        */
-                               struct rspamd_mime_text_part *sel;
-
-                               /* Prefer HTML as text part is not displayed normally */
-                               if (IS_TEXT_PART_HTML(p1)) {
-                                       sel = p1;
-                               }
-                               else if (IS_TEXT_PART_HTML(p2)) {
-                                       sel = p2;
-                               }
-                               else {
-                                       if (p1->utf_content.len > p2->utf_content.len) {
-                                               sel = p1;
-                                       }
-                                       else {
-                                               sel = p2;
-                                       }
-                               }
+       /* Find the first non-attachment HTML part that has an alternative */
+       PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
+       {
+               if (IS_TEXT_PART_HTML(text_part) &&
+                       !IS_TEXT_PART_ATTACHMENT(text_part) &&
+                       text_part->alt_text_part) {
+                       html_part = text_part;
+                       break;
+               }
+       }
  
-                               if (sel->language && sel->language[0]) {
-                                       /* Propagate language */
-                                       if (sel == p1) {
-                                               if (p2->languages) {
-                                                       g_ptr_array_unref(p2->languages);
-                                               }
+       if (html_part) {
+               p1 = html_part;
+               p2 = html_part->alt_text_part;
  
-                                               p2->language = sel->language;
-                                               p2->languages = g_ptr_array_ref(sel->languages);
-                                       }
-                                       else {
-                                               if (p1->languages) {
-                                                       g_ptr_array_unref(p1->languages);
-                                               }
+               /* alt_text_part already ensures they share a multipart/alternative parent */
+               if (!IS_TEXT_PART_EMPTY(p1) && !IS_TEXT_PART_EMPTY(p2) &&
+                       p1->normalized_hashes && p2->normalized_hashes) {
+                       /*
+                        * We also detect language on one part and propagate it to
+                        * another one. Prefer HTML as text part is not displayed normally.
+                        */
+                       struct rspamd_mime_text_part *sel = p1; /* p1 is always HTML here */
  
-                                               p1->language = sel->language;
-                                               p1->languages = g_ptr_array_ref(sel->languages);
-                                       }
+                       if (sel->language && sel->language[0]) {
+                               /* Propagate language from HTML to text part */
+                               if (p2->languages) {
+                                       g_ptr_array_unref(p2->languages);
                                 }
  
-                               tw = p1->normalized_hashes->len + p2->normalized_hashes->len;
+                               p2->language = sel->language;
+                               p2->languages = g_ptr_array_ref(sel->languages);
+                       }
  
-                               if (tw > 0) {
-                                       dw = rspamd_words_levenshtein_distance(task,
-                                                                                                                  p1->normalized_hashes,
-                                                                                                                  p2->normalized_hashes);
-                                       diff = dw / (double) tw;
+                       tw = p1->normalized_hashes->len + p2->normalized_hashes->len;
  
-                                       msg_debug_task(
-                                               "different words: %d, total words: %d, "
-                                               "got diff between parts of %.2f",
-                                               dw, tw,
-                                               diff);
+                       if (tw > 0) {
+                               dw = rspamd_words_levenshtein_distance(task,
+                                                                                                          p1->normalized_hashes,
+                                                                                                          p2->normalized_hashes);
+                               diff = dw / (double) tw;
  
-                                       pdiff = rspamd_mempool_alloc(task->task_pool,
-                                                                                                sizeof(double));
-                                       *pdiff = diff;
-                                       rspamd_mempool_set_variable(task->task_pool,
-                                                                                               "parts_distance",
-                                                                                               pdiff,
-                                                                                               NULL);
-                                       ptw = rspamd_mempool_alloc(task->task_pool,
-                                                                                          sizeof(int));
-                                       *ptw = tw;
-                                       rspamd_mempool_set_variable(task->task_pool,
-                                                                                               "total_words",
-                                                                                               ptw,
-                                                                                               NULL);
-                               }
-                       }
-                       else {
-                               /*
-                                        * Handle cases where parts differ significantly:
-                                        * - One part is empty, another is not
-                                        * - One part has words, another has none (but isn't empty)
-                                        * In both cases, treat as 100% difference
-                                        */
-                               gboolean p1_has_words = p1->normalized_hashes &&
-                                                                               p1->normalized_hashes->len > 0;
-                               gboolean p2_has_words = p2->normalized_hashes &&
-                                                                               p2->normalized_hashes->len > 0;
-
-                               if (p1_has_words != p2_has_words) {
-                                       struct rspamd_mime_text_part *non_empty =
-                                               p1_has_words ? p1 : p2;
-
-                                       tw = non_empty->normalized_hashes->len;
-
-                                       msg_debug_task(
-                                               "one part has no words, another has %d words, "
-                                               "got diff between parts of 1.0",
-                                               tw);
-
-                                       pdiff = rspamd_mempool_alloc(task->task_pool,
-                                                                                                sizeof(double));
-                                       *pdiff = 1.0;
-                                       rspamd_mempool_set_variable(task->task_pool,
-                                                                                               "parts_distance",
-                                                                                               pdiff,
-                                                                                               NULL);
-                                       ptw = rspamd_mempool_alloc(task->task_pool,
-                                                                                          sizeof(int));
-                                       *ptw = tw;
-                                       rspamd_mempool_set_variable(task->task_pool,
-                                                                                               "total_words",
-                                                                                               ptw,
-                                                                                               NULL);
-                               }
+                               msg_debug_task(
+                                       "different words: %d, total words: %d, "
+                                       "got diff between parts of %.2f",
+                                       dw, tw,
+                                       diff);
+
+                               pdiff = rspamd_mempool_alloc(task->task_pool,
+                                                                                        sizeof(double));
+                               *pdiff = diff;
+                               rspamd_mempool_set_variable(task->task_pool,
+                                                                                       "parts_distance",
+                                                                                       pdiff,
+                                                                                       NULL);
+                               ptw = rspamd_mempool_alloc(task->task_pool,
+                                                                                  sizeof(int));
+                               *ptw = tw;
+                               rspamd_mempool_set_variable(task->task_pool,
+                                                                                       "total_words",
+                                                                                       ptw,
+                                                                                       NULL);
                         }
                 }
                 else {
-                       debug_task("message contains two parts but they are in different multi-parts");
+                       /*
+                        * Handle cases where parts differ significantly:
+                        * - One part is empty, another is not
+                        * - One part has words, another has none (but isn't empty)
+                        * In both cases, treat as 100% difference
+                        */
+                       gboolean p1_has_words = p1->normalized_hashes &&
+                                                                       p1->normalized_hashes->len > 0;
+                       gboolean p2_has_words = p2->normalized_hashes &&
+                                                                       p2->normalized_hashes->len > 0;
+
+                       if (p1_has_words != p2_has_words) {
+                               struct rspamd_mime_text_part *non_empty =
+                                       p1_has_words ? p1 : p2;
+
+                               tw = non_empty->normalized_hashes->len;
+
+                               msg_debug_task(
+                                       "one part has no words, another has %d words, "
+                                       "got diff between parts of 1.0",
+                                       tw);
+
+                               pdiff = rspamd_mempool_alloc(task->task_pool,
+                                                                                        sizeof(double));
+                               *pdiff = 1.0;
+                               rspamd_mempool_set_variable(task->task_pool,
+                                                                                       "parts_distance",
+                                                                                       pdiff,
+                                                                                       NULL);
+                               ptw = rspamd_mempool_alloc(task->task_pool,
+                                                                                  sizeof(int));
+                               *ptw = tw;
+                               rspamd_mempool_set_variable(task->task_pool,
+                                                                                       "total_words",
+                                                                                       ptw,
+                                                                                       NULL);
+                       }
                 }
         }
  
diff --git a/src/libmime/message.h b/src/libmime/message.h

index 19606a8c0fe41623a9f25b7f19859492960b74c5..c4be39d9a3c916ade9f8204cb36f0796e78be071 100644 (file)
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -163,6 +163,7 @@ struct rspamd_mime_text_part {
         rspamd_html_heap_storage_t *cta_urls; /**< cta_heap_t* for HTML parts, NULL for plain text */
         GList *exceptions;                    /**< list of offsets of urls                                              */
         struct rspamd_mime_part *mime_part;
+       struct rspamd_mime_text_part *alt_text_part; /**< alternative text part (text for html, html for text) */
  
         unsigned int flags;
         unsigned int nlines;
diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c

index 3302a0982d7f5b965966f3f1a8732079f43b5ab2..d62c28735bcb9b41563254cf38ebd8f038cbf17a 100644 (file)
--- a/src/libmime/mime_expressions.c
+++ b/src/libmime/mime_expressions.c
@@ -1470,21 +1470,26 @@ rspamd_has_only_html_part(struct rspamd_task *task, GArray *args,
                                                   void *unused)
  {
         struct rspamd_mime_text_part *p;
-       unsigned int i, cnt_html = 0, cnt_txt = 0;
+       unsigned int i;
  
+       /*
+        * Return TRUE if there's any HTML part (not attachment) that has no
+        * text/plain alternative in its multipart/alternative parent.
+        * This properly handles nested structures like:
+        * - multipart/mixed → multipart/related → text/html (no text/plain)
+        * - multipart/alternative → text/plain + multipart/related → text/html
+        */
         PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, p)
         {
-               if (!IS_TEXT_PART_ATTACHMENT(p)) {
-                       if (IS_TEXT_PART_HTML(p)) {
-                               cnt_html++;
-                       }
-                       else {
-                               cnt_txt++;
+               if (!IS_TEXT_PART_ATTACHMENT(p) && IS_TEXT_PART_HTML(p)) {
+                       if (p->alt_text_part == NULL) {
+                               /* HTML part with no text alternative */
+                               return TRUE;
                         }
                 }
         }
  
-       return (cnt_html > 0 && cnt_txt == 0);
+       return FALSE;
  }
  
  static gboolean
diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c

index 25b6815dbb282024853c0adbef461ca72d76a4f4..db60b24e2cb930f6ad92cc53559cd3b42c747cd5 100644 (file)
--- a/src/lua/lua_mimepart.c
+++ b/src/lua/lua_mimepart.c
@@ -230,6 +230,13 @@ LUA_FUNCTION_DEF(textpart, get_fuzzy_hashes);
   * @return {mimepart} mimepart object
   */
  LUA_FUNCTION_DEF(textpart, get_mimepart);
+/***
+ * @method text_part:get_alt_part()
+ * Returns the alternative text part (text/plain for HTML, text/html for plain text)
+ * within the same multipart/alternative container. Returns nil if no alternative exists.
+ * @return {text_part|nil} alternative text part or nil
+ */
+LUA_FUNCTION_DEF(textpart, get_alt_part);
  
  /***
   * @method text_part:get_html_fuzzy_hashes(mempool)
@@ -260,6 +267,7 @@ static const struct luaL_reg textpartlib_m[] = {
         LUA_INTERFACE_DEF(textpart, get_charset),
         LUA_INTERFACE_DEF(textpart, get_languages),
         LUA_INTERFACE_DEF(textpart, get_mimepart),
+       LUA_INTERFACE_DEF(textpart, get_alt_part),
         LUA_INTERFACE_DEF(textpart, get_stats),
         LUA_INTERFACE_DEF(textpart, get_fuzzy_hashes),
         LUA_INTERFACE_DEF(textpart, get_html_fuzzy_hashes),
@@ -1616,6 +1624,27 @@ lua_textpart_get_mimepart(lua_State *L)
         return 1;
  }
  
+static int
+lua_textpart_get_alt_part(lua_State *L)
+{
+       LUA_TRACE_POINT;
+       struct rspamd_mime_text_part *part = lua_check_textpart(L);
+       struct rspamd_mime_text_part **ppart;
+
+       if (part != NULL) {
+               if (part->alt_text_part != NULL) {
+                       ppart = lua_newuserdata(L, sizeof(struct rspamd_mime_text_part *));
+                       rspamd_lua_setclass(L, rspamd_textpart_classname, -1);
+                       *ppart = part->alt_text_part;
+
+                       return 1;
+               }
+       }
+
+       lua_pushnil(L);
+       return 1;
+}
+
  /***
   * @method mime_part:get_stats()
   * Returns a table with the following data:
diff --git a/test/functional/cases/001_merged/100_general.robot b/test/functional/cases/001_merged/100_general.robot

index 19ebb3e6ca3117a0b99555aad754e697fd9aa812..9456beaa3635340ce38dba24d7e2640092ec4474 100644 (file)
--- a/test/functional/cases/001_merged/100_general.robot
+++ b/test/functional/cases/001_merged/100_general.robot
@@ -6,6 +6,7 @@ Variables       ${RSPAMD_TESTDIR}/lib/vars.py
  *** Variables ***
  ${GTUBE}               ${RSPAMD_TESTDIR}/messages/gtube.eml
  ${ALT_RELATED}         ${RSPAMD_TESTDIR}/messages/alternative-related.eml
+${MIXED_RELATED_HTML}  ${RSPAMD_TESTDIR}/messages/mixed-related-html-only.eml
  ${SETTINGS_NOSYMBOLS}  {symbols_enabled = []}
  
  *** Test Cases ***
@@ -66,6 +67,11 @@ HTML ONLY - multipart/related inside alternative
    ...  Settings={symbols_enabled = [MIME_HTML_ONLY]}
    Do Not Expect Symbol  MIME_HTML_ONLY
  
+HTML ONLY - multipart/mixed with related (html only)
+  Scan File  ${MIXED_RELATED_HTML}
+  ...  Settings={symbols_enabled = [MIME_HTML_ONLY]}
+  Expect Symbol  MIME_HTML_ONLY
+
  PARTS DIFFER - multipart/related inside alternative
    Scan File  ${ALT_RELATED}
    ...  Settings={symbols_enabled = [R_PARTS_DIFFER]}
diff --git a/test/functional/messages/btc.eml b/test/functional/messages/btc.eml

index 840a5cbf1ac1d3bde90581dd111697953dd1b5b4..20ceb1daccccee8b331582d3b575aa845b373551 100644 (file)
--- a/test/functional/messages/btc.eml
+++ b/test/functional/messages/btc.eml
@@ -5,7 +5,7 @@ Date: Mon, 27 Apr 2020 19:54:10 +0300
  Message-ID: <CA+1S=h4aGimA6vSBJF=t1F+5z-Mua5+Cimf+NU_NDWJk8ZNOcw@mail.gmail.com>
  Subject: Fwd:
  To: <test@test.ru>
-Content-Type: multipart/alternative; boundary="00000000000004de7805a4489190"
+Content-Type: multipart/alternative; boundary="0000000000004bee6805a4484c02"
  
  --0000000000004bee6805a4484c02
  Content-Type: text/plain; charset="UTF-8"
diff --git a/test/functional/messages/mixed-related-html-only.eml b/test/functional/messages/mixed-related-html-only.eml

new file mode 100644 (file)

index 0000000..616137f
--- /dev/null
+++ b/test/functional/messages/mixed-related-html-only.eml
@@ -0,0 +1,35 @@
+Return-Path: test@test.com
+From: TEST <test@test.com>
+To: Me <me@me.me>
+Subject: multipart/mixed with related (html only)
+MIME-Version: 1.0
+Date: Mon, 01 Jan 2024 00:00:00 +0000
+Message-ID: <mixed-related-html-only@test.com>
+Content-Type: multipart/mixed; boundary="19//AYBSG9F/KH4LSBP="
+
+--19//AYBSG9F/KH4LSBP=
+Content-Type: multipart/related; boundary="29//AYBSG9F/KH4LSBP="
+
+--29//AYBSG9F/KH4LSBP=
+Content-Type: text/html; charset="UTF-8"
+
+<html>
+<body>
+This is an HTML-only message body.
+</body>
+</html>
+
+--29//AYBSG9F/KH4LSBP=
+Content-Type: image/jpeg; name="dc660250d1b3d161181bd661989c2a74.jpg"
+Content-ID: <dc660250d1b3d161181bd661989c2a74.jpg>
+
+dummy
+
+--29//AYBSG9F/KH4LSBP=--
+
+--19//AYBSG9F/KH4LSBP=
+Content-Type: application/pdf; name="Chamblisslaw Payroll Wages_Memo_1NZ7-CIRM1N-MGX6.pdf"
+
+dummy
+
+--19//AYBSG9F/KH4LSBP=--
author	Vsevolod Stakhov <vsevolod@rspamd.com>
	Wed, 4 Feb 2026 12:02:06 +0000 (12:02 +0000)
committer	Vsevolod Stakhov <vsevolod@rspamd.com>
	Wed, 4 Feb 2026 12:02:06 +0000 (12:02 +0000)
src/libmime/message.c		patch \| blob \| blame \| history
src/libmime/message.h		patch \| blob \| blame \| history
src/libmime/mime_expressions.c		patch \| blob \| blame \| history
src/lua/lua_mimepart.c		patch \| blob \| blame \| history
test/functional/cases/001_merged/100_general.robot		patch \| blob \| blame \| history
test/functional/messages/btc.eml		patch \| blob \| blame \| history
test/functional/messages/mixed-related-html-only.eml	[new file with mode: 0644]	patch \| blob