git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Add order to urls structure
author Vsevolod Stakhov <vsevolod@rspamd.com>
Tue, 25 Jul 2023 14:56:43 +0000 (15:56 +0100)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Tue, 25 Jul 2023 15:05:06 +0000 (16:05 +0100)
src/libmime/message.c
src/libserver/html/html.cxx
src/libserver/html/html.h
src/libserver/html/html.hxx
src/libserver/html/html_tests.cxx
src/libserver/url.c
src/libserver/url.h

index 5ab712283c735531bc0a5555b0eb94dd4b76a914..508ea27ea48ad97fe16e21666e4e3eecd2aa7234 100644 (file)
@@ -763,7 +763,8 @@ rspamd_message_process_plain_text_part (struct rspamd_task *task,
 
 static gboolean
 rspamd_message_process_html_text_part (struct rspamd_task *task,
-                                                                               struct rspamd_mime_text_part *text_part)
+                                                                          struct rspamd_mime_text_part *text_part,
+                                                                          uint16_t *cur_url_order)
 {
        text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML;
 
@@ -786,7 +787,8 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
                        &text_part->exceptions,
                        MESSAGE_FIELD (task, urls),
                        text_part->mime_part->urls,
-                       task->cfg ? task->cfg->enable_css_parser : true);
+                       task->cfg ? task->cfg->enable_css_parser : true,
+                       cur_url_order);
        rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);
 
        if (text_part->utf_content.len == 0) {
@@ -842,7 +844,8 @@ rspamd_message_part_can_be_parsed_as_text (struct rspamd_task *task,
 static gboolean
 rspamd_message_process_text_part_maybe (struct rspamd_task *task,
                                                                                struct rspamd_mime_part *mime_part,
-                                                                               enum rspamd_message_part_is_text_result is_text)
+                                                                               enum rspamd_message_part_is_text_result is_text,
+                                                                               uint16_t *cur_url_order)
 {
        struct rspamd_mime_text_part *text_part;
        guint flags = 0;
@@ -864,7 +867,7 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
        text_part->flags |= flags;
 
        if (is_text == RSPAMD_MESSAGE_PART_IS_TEXT_HTML) {
-               if (!rspamd_message_process_html_text_part (task, text_part)) {
+               if (!rspamd_message_process_html_text_part (task, text_part, cur_url_order)) {
                        return FALSE;
                }
        }
@@ -911,14 +914,14 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
                                 * Use strict extraction mode: we will extract missing urls from
                                 * an html part if needed
                                 */
-                               rspamd_url_text_extract (task->task_pool, task, text_part,
+                               rspamd_url_text_extract (task->task_pool, task, text_part, cur_url_order,
                                                RSPAMD_URL_FIND_STRICT);
                        }
                        else {
                                /*
                                 * Fall back to full text extraction using TLD patterns
                                 */
-                               rspamd_url_text_extract (task->task_pool, task, text_part,
+                               rspamd_url_text_extract (task->task_pool, task, text_part, cur_url_order,
                                                RSPAMD_URL_FIND_ALL);
                        }
                }
@@ -926,12 +929,12 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
                        /*
                         * Fall back to full text extraction using TLD patterns
                        */
-                       rspamd_url_text_extract (task->task_pool, task, text_part,
+                       rspamd_url_text_extract (task->task_pool, task, text_part, cur_url_order,
                                        RSPAMD_URL_FIND_ALL);
                }
        }
        else {
-               rspamd_url_text_extract (task->task_pool, task, text_part,
+               rspamd_url_text_extract (task->task_pool, task, text_part, cur_url_order,
                                RSPAMD_URL_FIND_STRICT);
        }
 
@@ -1487,13 +1490,14 @@ rspamd_message_process (struct rspamd_task *task)
                }
        }
 
+       uint16_t cur_url_order = 0;
        g_array_sort(detected_text_parts, rspamd_mime_text_part_position_compare_func);
        /* One more iteration to process text parts in a more specific order */
        for (i = 0; i < detected_text_parts->len; i ++) {
                part = g_ptr_array_index (MESSAGE_FIELD (task, parts),
                        g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).pos);
                rspamd_message_process_text_part_maybe(task, part,
-                       g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).res);
+                       g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).res, &cur_url_order);
        }
 
        g_array_free (detected_text_parts, TRUE);
@@ -1640,7 +1644,6 @@ rspamd_message_process (struct rspamd_task *task)
        }
 
        rspamd_images_link (task);
-
        rspamd_tokenize_meta_words (task);
 }
 
index a848a25d3fa04d98b95bdd2924b61d450d0f1fd9..ed034b9286baf5776847eb5c008bac751d942b82 100644 (file)
@@ -1328,7 +1328,8 @@ html_process_input(struct rspamd_task *task,
                                   GList **exceptions,
                                   khash_t (rspamd_url_hash) *url_set,
                                   GPtrArray *part_urls,
-                                  bool allow_css) -> html_content *
+                                  bool allow_css,
+                                  std::uint16_t *cur_url_order) -> html_content *
 {
        const gchar *p, *c, *end, *start;
        guchar t;
@@ -1372,6 +1373,7 @@ html_process_input(struct rspamd_task *task,
        g_assert (task != NULL);
 
        auto *pool = task->task_pool;
+       auto cur_url_part_order = 0u;
 
        auto *hc = new html_content;
        rspamd_mempool_add_destructor(task->task_pool, html_content::html_content_dtor, hc);
@@ -1472,6 +1474,10 @@ html_process_input(struct rspamd_task *task,
                                        struct rspamd_url *maybe_existing =
                                                        rspamd_url_set_add_or_return(url_set, maybe_url.value());
                                        if (maybe_existing == maybe_url.value()) {
+                                               if (cur_url_order) {
+                                                       url->order = (*cur_url_order)++; /* bump the shared counter, not the pointer */
+                                               }
+                                               url->part_order = cur_url_part_order++;
                                                html_process_query_url(pool, url, url_set,
                                                                part_urls);
                                        }
@@ -2273,10 +2279,11 @@ rspamd_html_process_part_full(struct rspamd_task *task,
                                                          GByteArray *in, GList **exceptions,
                                                          khash_t (rspamd_url_hash) *url_set,
                                                          GPtrArray *part_urls,
-                                                         bool allow_css)
+                                                         bool allow_css,
+                                                         uint16_t *cur_url_order)
 {
        return rspamd::html::html_process_input(task, in, exceptions, url_set,
-                       part_urls, allow_css);
+                       part_urls, allow_css, cur_url_order);
 }
 
 void *
@@ -2286,9 +2293,10 @@ rspamd_html_process_part(rspamd_mempool_t *pool,
        struct rspamd_task fake_task;
        memset(&fake_task, 0, sizeof(fake_task));
        fake_task.task_pool = pool;
+       uint16_t order = 0;
 
        return rspamd_html_process_part_full (&fake_task, in, NULL,
-                       NULL, NULL, FALSE);
+                       NULL, NULL, FALSE, &order);
 }
 
 guint
index 2a43223f92d0cced68b2b05305edf8a99c1fafde..17067b3b19141c7f3f816a61aee6de1c9cd58411 100644 (file)
@@ -74,7 +74,8 @@ void *rspamd_html_process_part_full(struct rspamd_task *task,
                                                                        GByteArray *in, GList **exceptions,
                                                                        khash_t (rspamd_url_hash) *url_set,
                                                                        GPtrArray *part_urls,
-                                                                       bool allow_css);
+                                                                       bool allow_css,
+                                                                       uint16_t *cur_url_order);
 
 /*
  * Returns true if a specified tag has been seen in a part
index 34008aaf7566b767d813052258bf08b96058c0dc..c119adc3fcf5fb12cfdea929c6521c6d99898a7e 100644 (file)
@@ -132,7 +132,8 @@ auto html_process_input(struct rspamd_task *task,
                                   GList **exceptions,
                                   khash_t (rspamd_url_hash) *url_set,
                                   GPtrArray *part_urls,
-                                  bool allow_css) -> html_content *;
+                                  bool allow_css,
+                                  std::uint16_t *cur_url_order) -> html_content *;
 auto html_debug_structure(const html_content &hc) -> std::string;
 
 }
index 9ab13ee7840a90876a5df276152f736ff66c744e..2492337bfda02c2e1d47cf5a8bcdb53b1b573f06 100644 (file)
@@ -58,7 +58,7 @@ TEST_CASE("html parsing")
                SUBCASE((std::string("extract tags from: ") + c.first).c_str()) {
                        GByteArray *tmp = g_byte_array_sized_new(c.first.size());
                        g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size());
-                       auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true);
+                       auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true, nullptr);
                        CHECK(hc != nullptr);
                        auto dump = html_debug_structure(*hc);
                        CHECK(c.second == dump);
@@ -215,7 +215,7 @@ TEST_CASE("html text extraction")
                SUBCASE((fmt::format("html extraction case {}", i)).c_str()) {
                        GByteArray *tmp = g_byte_array_sized_new(c.first.size());
                        g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size());
-                       auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true);
+                       auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true, nullptr);
                        CHECK(hc != nullptr);
                        replace_newlines(hc->parsed);
                        auto expected = c.second;
@@ -259,7 +259,7 @@ TEST_CASE("html urls extraction")
                        auto input = std::get<0>(c);
                        GByteArray *tmp = g_byte_array_sized_new(input.size());
                        g_byte_array_append(tmp, (const guint8 *)input.data(), input.size());
-                       auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, purls, true);
+                       auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, purls, true, nullptr);
                        CHECK(hc != nullptr);
                        auto &expected_text = std::get<2>(c);
                        if (expected_text.has_value()) {
index 33198b8613f4d12a001623cdb3b29969ddf4702b..d5dafeaeabde6c3c51f783eeed551d22cb1cd93f 100644 (file)
@@ -2244,6 +2244,9 @@ rspamd_url_parse (struct rspamd_url *uri,
        memset (uri, 0, sizeof (*uri));
        memset (&u, 0, sizeof (u));
        uri->count = 1;
+       /* Order is not known yet: -1 wraps to the uint16_t maximum */
+       uri->order = -1;
+       uri->part_order = -1;
 
        if (*uristring == '\0') {
                return URI_ERRNO_EMPTY;
@@ -3453,6 +3456,8 @@ struct rspamd_url_mimepart_cbdata {
        struct rspamd_task *task;
        struct rspamd_mime_text_part *part;
        gsize url_len;
+       uint16_t *cur_url_order; /* Global ordering */
+       uint16_t cur_part_order; /* Per part ordering */
 };
 
 static gboolean
@@ -3488,6 +3493,12 @@ rspamd_url_query_callback (struct rspamd_url *url, gsize start_offset,
                if (cbd->part && cbd->part->mime_part->urls) {
                        g_ptr_array_add (cbd->part->mime_part->urls, url);
                }
+
+               url->part_order = cbd->cur_part_order ++;
+
+               if (cbd->cur_url_order) {
+                       url->order = (*cbd->cur_url_order)++; /* bump the shared counter, not the pointer */
+               }
        }
 
        return TRUE;
@@ -3542,6 +3553,11 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
 
        if (rspamd_url_set_add_or_increase(MESSAGE_FIELD (task, urls), url, false) &&
                cbd->part->mime_part->urls) {
+               url->part_order = cbd->cur_part_order ++;
+
+               if (cbd->cur_url_order) {
+                       url->order = (*cbd->cur_url_order)++; /* bump the shared counter, not the pointer */
+               }
                g_ptr_array_add (cbd->part->mime_part->urls, url);
        }
 
@@ -3564,6 +3580,7 @@ void
 rspamd_url_text_extract (rspamd_mempool_t *pool,
                                                 struct rspamd_task *task,
                                                 struct rspamd_mime_text_part *part,
+                                                uint16_t *cur_url_order,
                                                 enum rspamd_url_find_type how)
 {
        struct rspamd_url_mimepart_cbdata mcbd;
@@ -3576,6 +3593,8 @@ rspamd_url_text_extract (rspamd_mempool_t *pool,
        mcbd.task = task;
        mcbd.part = part;
        mcbd.url_len = 0;
+       mcbd.cur_url_order = cur_url_order;
+       mcbd.cur_part_order = 0;
 
        rspamd_url_find_multiple (task->task_pool, part->utf_stripped_content->data,
                        part->utf_stripped_content->len, how, part->newlines,
index 7a005efd85d2b378e7a7242f27aff85ab7eef859..f3d5617369d1a4e620558644c1f6bb124970bb7a 100644 (file)
@@ -82,6 +82,11 @@ struct rspamd_url {
        uint16_t count;
        uint16_t urllen;
        uint16_t rawlen;
+
+       /* Absolute order of the URL in a message */
+       uint16_t order;
+       /* Order of the URL in a specific part of message */
+       uint16_t part_order;
 };
 
 /**
@@ -156,6 +161,7 @@ void rspamd_url_deinit(void);
 void rspamd_url_text_extract(rspamd_mempool_t *pool,
                                                         struct rspamd_task *task,
                                                         struct rspamd_mime_text_part *part,
+                                                        uint16_t *cur_order,
                                                         enum rspamd_url_find_type how);
 
 /*