From: Vsevolod Stakhov Date: Wed, 4 Feb 2026 12:02:06 +0000 (+0000) Subject: [Fix] Rework alternative parts detection X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ba35d63be787c3d76f04b0604a83b033f3af5efe;p=thirdparty%2Frspamd.git [Fix] Rework alternative parts detection --- diff --git a/src/libmime/message.c b/src/libmime/message.c index cca5efebed..3b8db30623 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -998,6 +998,121 @@ rspamd_mime_part_is_in_multipart_alternative(struct rspamd_mime_part *part) "alternative", 11) != NULL; } +/* + * Check if `ancestor` is an ancestor of `part` (or equal to it) + */ +static gboolean +rspamd_mime_part_is_ancestor(struct rspamd_mime_part *part, + struct rspamd_mime_part *ancestor) +{ + while (part) { + if (part == ancestor) { + return TRUE; + } + part = part->parent_part; + } + return FALSE; +} + +/* + * Recursively search for a text part in a subtree. + * - `root` is the multipart to search in + * - `exclude` is the subtree to skip (the branch containing the original part) + * - `want_html` TRUE means we want HTML part, FALSE means we want plain text + * - Returns the first matching text part, or NULL + * - Stops recursion if we hit a message/rfc822 part + */ +static struct rspamd_mime_text_part * +rspamd_mime_part_find_text_in_subtree(struct rspamd_mime_part *root, + struct rspamd_mime_part *exclude, + gboolean want_html) +{ + if (!root || !IS_PART_MULTIPART(root) || !root->specific.mp) { + return NULL; + } + + struct rspamd_mime_multipart *mp = root->specific.mp; + + for (unsigned int i = 0; i < mp->children->len; i++) { + struct rspamd_mime_part *child = g_ptr_array_index(mp->children, i); + + /* Skip the excluded subtree */ + if (exclude && rspamd_mime_part_is_ancestor(exclude, child)) { + continue; + } + + /* Stop at message/rfc822 parts (embedded messages) */ + if (IS_PART_MESSAGE(child)) { + continue; + } + + /* Check if this is the text part we want */ + if (IS_PART_TEXT(child) && child->specific.txt) { + struct rspamd_mime_text_part *txt = child->specific.txt; + + /* Skip attachments */ + if (IS_TEXT_PART_ATTACHMENT(txt)) { + continue; + } + + gboolean is_html = IS_TEXT_PART_HTML(txt); + if (want_html == is_html) { + return txt; + } + } + + /* Recurse into multiparts */ + if (IS_PART_MULTIPART(child)) { + struct rspamd_mime_text_part *found = + rspamd_mime_part_find_text_in_subtree(child, NULL, want_html); + if (found) { + return found; + } + } + } + + return NULL; +} + +/* + * Find the alternative text part for a given text part. + * Algorithm: + * 1. Walk up from the part's mime_part to find a multipart/alternative ancestor + * 2. If found, identify which child subtree contains our part + * 3. Search other children for a text part of the opposite type + * (plain for HTML, HTML for plain) + * 4. Stop if we hit message/rfc822 (embedded message) + */ +static struct rspamd_mime_text_part * +rspamd_mime_text_part_find_alternative(struct rspamd_mime_text_part *text_part) +{ + if (!text_part || !text_part->mime_part) { + return NULL; + } + + struct rspamd_mime_part *mime_part = text_part->mime_part; + gboolean is_html = IS_TEXT_PART_HTML(text_part); + + /* Find the multipart/alternative ancestor */ + struct rspamd_mime_part *alt_parent = + rspamd_mime_part_find_parent_multipart_subtype(mime_part->parent_part, + "alternative", 11); + + if (!alt_parent) { + /* No alternative parent, no alternative part */ + return NULL; + } + + /* Find the direct child of alt_parent that contains our mime_part */ + struct rspamd_mime_part *our_branch = mime_part; + while (our_branch->parent_part && our_branch->parent_part != alt_parent) { + our_branch = our_branch->parent_part; + } + + /* Search other branches for a text part of the opposite type */ + return rspamd_mime_part_find_text_in_subtree(alt_parent, our_branch, !is_html); +} + static enum rspamd_message_part_is_text_result rspamd_message_part_can_be_parsed_as_text(struct rspamd_task *task, struct rspamd_mime_part *mime_part) @@ -1855,134 +1970,117 @@ void rspamd_message_process(struct rspamd_task *task) } } - /* Calculate distance for 2-parts messages */ - if (i == 2) { - p1 = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), 0); - p2 = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), 1); + /* Compute alternative text parts for each text part */ + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part) + { + text_part->alt_text_part = rspamd_mime_text_part_find_alternative(text_part); + } - struct rspamd_mime_part *alt_parent1 = rspamd_mime_part_find_parent_multipart_subtype( - p1->mime_part->parent_part, "alternative", 11); - struct rspamd_mime_part *alt_parent2 = rspamd_mime_part_find_parent_multipart_subtype( - p2->mime_part->parent_part, "alternative", 11); + /* Calculate distance for alternative parts */ + struct rspamd_mime_text_part *html_part = NULL; - /* We compare parts only if they belong to the same multipart/alternative container */ - if (alt_parent1 && alt_parent1 == alt_parent2) { - if (!IS_TEXT_PART_EMPTY(p1) && !IS_TEXT_PART_EMPTY(p2) && - p1->normalized_hashes && p2->normalized_hashes) { - /* - * We also detect language on one part and propagate it to - * another one - */ - struct rspamd_mime_text_part *sel; - - /* Prefer HTML as text part is not displayed normally */ - if (IS_TEXT_PART_HTML(p1)) { - sel = p1; - } - else if (IS_TEXT_PART_HTML(p2)) { - sel = p2; - } - else { - if (p1->utf_content.len > p2->utf_content.len) { - sel = p1; - } - else { - sel = p2; - } - } + /* Find the first non-attachment HTML part that has an alternative */ + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part) + { + if (IS_TEXT_PART_HTML(text_part) && + !IS_TEXT_PART_ATTACHMENT(text_part) && + text_part->alt_text_part) { + html_part = text_part; + break; + } + } - if (sel->language && sel->language[0]) { - /* Propagate language */ - if (sel == p1) { - if (p2->languages) { - g_ptr_array_unref(p2->languages); - } + if (html_part) { + p1 = html_part; + p2 = html_part->alt_text_part; - p2->language = sel->language; - p2->languages = g_ptr_array_ref(sel->languages); - } - else { - if (p1->languages) { - g_ptr_array_unref(p1->languages); - } + /* alt_text_part already ensures they share a multipart/alternative parent */ + if (!IS_TEXT_PART_EMPTY(p1) && !IS_TEXT_PART_EMPTY(p2) && + p1->normalized_hashes && p2->normalized_hashes) { + /* + * We also detect language on one part and propagate it to + * another one. Prefer HTML as text part is not displayed normally. + */ + struct rspamd_mime_text_part *sel = p1; /* p1 is always HTML here */ - p1->language = sel->language; - p1->languages = g_ptr_array_ref(sel->languages); - } + if (sel->language && sel->language[0]) { + /* Propagate language from HTML to text part */ + if (p2->languages) { + g_ptr_array_unref(p2->languages); } - tw = p1->normalized_hashes->len + p2->normalized_hashes->len; + p2->language = sel->language; + p2->languages = g_ptr_array_ref(sel->languages); + } - if (tw > 0) { - dw = rspamd_words_levenshtein_distance(task, - p1->normalized_hashes, - p2->normalized_hashes); - diff = dw / (double) tw; + tw = p1->normalized_hashes->len + p2->normalized_hashes->len; - msg_debug_task( - "different words: %d, total words: %d, " - "got diff between parts of %.2f", - dw, tw, - diff); + if (tw > 0) { + dw = rspamd_words_levenshtein_distance(task, + p1->normalized_hashes, + p2->normalized_hashes); + diff = dw / (double) tw; - pdiff = rspamd_mempool_alloc(task->task_pool, - sizeof(double)); - *pdiff = diff; - rspamd_mempool_set_variable(task->task_pool, - "parts_distance", - pdiff, - NULL); - ptw = rspamd_mempool_alloc(task->task_pool, - sizeof(int)); - *ptw = tw; - rspamd_mempool_set_variable(task->task_pool, - "total_words", - ptw, - NULL); - } - } - else { - /* - * Handle cases where parts differ significantly: - * - One part is empty, another is not - * - One part has words, another has none (but isn't empty) - * In both cases, treat as 100% difference - */ - gboolean p1_has_words = p1->normalized_hashes && - p1->normalized_hashes->len > 0; - gboolean p2_has_words = p2->normalized_hashes && - p2->normalized_hashes->len > 0; - - if (p1_has_words != p2_has_words) { - struct rspamd_mime_text_part *non_empty = - p1_has_words ? p1 : p2; - - tw = non_empty->normalized_hashes->len; - - msg_debug_task( - "one part has no words, another has %d words, " - "got diff between parts of 1.0", - tw); - - pdiff = rspamd_mempool_alloc(task->task_pool, - sizeof(double)); - *pdiff = 1.0; - rspamd_mempool_set_variable(task->task_pool, - "parts_distance", - pdiff, - NULL); - ptw = rspamd_mempool_alloc(task->task_pool, - sizeof(int)); - *ptw = tw; - rspamd_mempool_set_variable(task->task_pool, - "total_words", - ptw, - NULL); - } + msg_debug_task( + "different words: %d, total words: %d, " + "got diff between parts of %.2f", + dw, tw, + diff); + + pdiff = rspamd_mempool_alloc(task->task_pool, + sizeof(double)); + *pdiff = diff; + rspamd_mempool_set_variable(task->task_pool, + "parts_distance", + pdiff, + NULL); + ptw = rspamd_mempool_alloc(task->task_pool, + sizeof(int)); + *ptw = tw; + rspamd_mempool_set_variable(task->task_pool, + "total_words", + ptw, + NULL); } } else { - debug_task("message contains two parts but they are in different multi-parts"); + /* + * Handle cases where parts differ significantly: + * - One part is empty, another is not + * - One part has words, another has none (but isn't empty) + * In both cases, treat as 100% difference + */ + gboolean p1_has_words = p1->normalized_hashes && + p1->normalized_hashes->len > 0; + gboolean p2_has_words = p2->normalized_hashes && + p2->normalized_hashes->len > 0; + + if (p1_has_words != p2_has_words) { + struct rspamd_mime_text_part *non_empty = + p1_has_words ? p1 : p2; + + tw = non_empty->normalized_hashes->len; + + msg_debug_task( + "one part has no words, another has %d words, " + "got diff between parts of 1.0", + tw); + + pdiff = rspamd_mempool_alloc(task->task_pool, + sizeof(double)); + *pdiff = 1.0; + rspamd_mempool_set_variable(task->task_pool, + "parts_distance", + pdiff, + NULL); + ptw = rspamd_mempool_alloc(task->task_pool, + sizeof(int)); + *ptw = tw; + rspamd_mempool_set_variable(task->task_pool, + "total_words", + ptw, + NULL); + } } } diff --git a/src/libmime/message.h b/src/libmime/message.h index 19606a8c0f..c4be39d9a3 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -163,6 +163,7 @@ struct rspamd_mime_text_part { rspamd_html_heap_storage_t *cta_urls; /**< cta_heap_t* for HTML parts, NULL for plain text */ GList *exceptions; /**< list of offsets of urls */ struct rspamd_mime_part *mime_part; + struct rspamd_mime_text_part *alt_text_part; /**< alternative text part (text for html, html for text) */ unsigned int flags; unsigned int nlines; diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c index 3302a0982d..d62c28735b 100644 --- a/src/libmime/mime_expressions.c +++ b/src/libmime/mime_expressions.c @@ -1470,21 +1470,26 @@ rspamd_has_only_html_part(struct rspamd_task *task, GArray *args, void *unused) { struct rspamd_mime_text_part *p; - unsigned int i, cnt_html = 0, cnt_txt = 0; + unsigned int i; + /* + * Return TRUE if there's any HTML part (not attachment) that has no + * text/plain alternative in its multipart/alternative parent. + * This properly handles nested structures like: + * - multipart/mixed → multipart/related → text/html (no text/plain) + * - multipart/alternative → text/plain + multipart/related → text/html + */ PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, p) { - if (!IS_TEXT_PART_ATTACHMENT(p)) { - if (IS_TEXT_PART_HTML(p)) { - cnt_html++; - } - else { - cnt_txt++; + if (!IS_TEXT_PART_ATTACHMENT(p) && IS_TEXT_PART_HTML(p)) { + if (p->alt_text_part == NULL) { + /* HTML part with no text alternative */ + return TRUE; } } } - return (cnt_html > 0 && cnt_txt == 0); + return FALSE; } static gboolean diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c index 25b6815dbb..db60b24e2c 100644 --- a/src/lua/lua_mimepart.c +++ b/src/lua/lua_mimepart.c @@ -230,6 +230,13 @@ LUA_FUNCTION_DEF(textpart, get_fuzzy_hashes); * @return {mimepart} mimepart object */ LUA_FUNCTION_DEF(textpart, get_mimepart); +/*** + * @method text_part:get_alt_part() + * Returns the alternative text part (text/plain for HTML, text/html for plain text) + * within the same multipart/alternative container. Returns nil if no alternative exists. + * @return {text_part|nil} alternative text part or nil + */ +LUA_FUNCTION_DEF(textpart, get_alt_part); /*** * @method text_part:get_html_fuzzy_hashes(mempool) @@ -260,6 +267,7 @@ static const struct luaL_reg textpartlib_m[] = { LUA_INTERFACE_DEF(textpart, get_charset), LUA_INTERFACE_DEF(textpart, get_languages), LUA_INTERFACE_DEF(textpart, get_mimepart), + LUA_INTERFACE_DEF(textpart, get_alt_part), LUA_INTERFACE_DEF(textpart, get_stats), LUA_INTERFACE_DEF(textpart, get_fuzzy_hashes), LUA_INTERFACE_DEF(textpart, get_html_fuzzy_hashes), @@ -1616,6 +1624,27 @@ lua_textpart_get_mimepart(lua_State *L) return 1; } +static int +lua_textpart_get_alt_part(lua_State *L) +{ + LUA_TRACE_POINT; + struct rspamd_mime_text_part *part = lua_check_textpart(L); + struct rspamd_mime_text_part **ppart; + + if (part != NULL) { + if (part->alt_text_part != NULL) { + ppart = lua_newuserdata(L, sizeof(struct rspamd_mime_text_part *)); + rspamd_lua_setclass(L, rspamd_textpart_classname, -1); + *ppart = part->alt_text_part; + + return 1; + } + } + + lua_pushnil(L); + return 1; +} + /*** * @method mime_part:get_stats() * Returns a table with the following data: diff --git a/test/functional/cases/001_merged/100_general.robot b/test/functional/cases/001_merged/100_general.robot index 19ebb3e6ca..9456beaa36 100644 --- a/test/functional/cases/001_merged/100_general.robot +++ b/test/functional/cases/001_merged/100_general.robot @@ -6,6 +6,7 @@ Variables ${RSPAMD_TESTDIR}/lib/vars.py *** Variables *** ${GTUBE} ${RSPAMD_TESTDIR}/messages/gtube.eml ${ALT_RELATED} ${RSPAMD_TESTDIR}/messages/alternative-related.eml +${MIXED_RELATED_HTML} ${RSPAMD_TESTDIR}/messages/mixed-related-html-only.eml ${SETTINGS_NOSYMBOLS} {symbols_enabled = []} *** Test Cases *** @@ -66,6 +67,11 @@ HTML ONLY - multipart/related inside alternative ... Settings={symbols_enabled = [MIME_HTML_ONLY]} Do Not Expect Symbol MIME_HTML_ONLY +HTML ONLY - multipart/mixed with related (html only) + Scan File ${MIXED_RELATED_HTML} + ... Settings={symbols_enabled = [MIME_HTML_ONLY]} + Expect Symbol MIME_HTML_ONLY + PARTS DIFFER - multipart/related inside alternative Scan File ${ALT_RELATED} ... Settings={symbols_enabled = [R_PARTS_DIFFER]} diff --git a/test/functional/messages/btc.eml b/test/functional/messages/btc.eml index 840a5cbf1a..20ceb1dacc 100644 --- a/test/functional/messages/btc.eml +++ b/test/functional/messages/btc.eml @@ -5,7 +5,7 @@ Date: Mon, 27 Apr 2020 19:54:10 +0300 Message-ID: Subject: Fwd: To: -Content-Type: multipart/alternative; boundary="00000000000004de7805a4489190" +Content-Type: multipart/alternative; boundary="0000000000004bee6805a4484c02" --0000000000004bee6805a4484c02 Content-Type: text/plain; charset="UTF-8" diff --git a/test/functional/messages/mixed-related-html-only.eml b/test/functional/messages/mixed-related-html-only.eml new file mode 100644 index 0000000000..616137ffee --- /dev/null +++ b/test/functional/messages/mixed-related-html-only.eml @@ -0,0 +1,35 @@ +Return-Path: test@test.com +From: TEST +To: Me +Subject: multipart/mixed with related (html only) +MIME-Version: 1.0 +Date: Mon, 01 Jan 2024 00:00:00 +0000 +Message-ID: +Content-Type: multipart/mixed; boundary="19//AYBSG9F/KH4LSBP=" + +--19//AYBSG9F/KH4LSBP= +Content-Type: multipart/related; boundary="29//AYBSG9F/KH4LSBP=" + +--29//AYBSG9F/KH4LSBP= +Content-Type: text/html; charset="UTF-8" + + + +This is an HTML-only message body. + + + +--29//AYBSG9F/KH4LSBP= +Content-Type: image/jpeg; name="dc660250d1b3d161181bd661989c2a74.jpg" +Content-ID: + +dummy + +--29//AYBSG9F/KH4LSBP=-- + +--19//AYBSG9F/KH4LSBP= +Content-Type: application/pdf; name="Chamblisslaw Payroll Wages_Memo_1NZ7-CIRM1N-MGX6.pdf" + +dummy + +--19//AYBSG9F/KH4LSBP=--