]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Add html parsing limit
authorVsevolod Stakhov <vsevolod@rspamd.com>
Wed, 26 Apr 2023 21:54:24 +0000 (22:54 +0100)
committerVsevolod Stakhov <vsevolod@rspamd.com>
Wed, 26 Apr 2023 21:54:24 +0000 (22:54 +0100)
src/libmime/message.c
src/libserver/cfg_file.h
src/libserver/cfg_rcl.c
src/libserver/cfg_utils.c
src/libserver/html/html.cxx
src/libserver/html/html.h

index ec49b3b5e21e1d6d117ca8fcf93612a4aa8a4b5c..ad2cccf929ad6656dc6da67ea7a59219f9ad9743 100644 (file)
@@ -766,7 +766,7 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
 
 
        text_part->html = rspamd_html_process_part_full (
-                       task->task_pool,
+                       task,
                        text_part->utf_raw_content,
                        &text_part->exceptions,
                        MESSAGE_FIELD (task, urls),
index 44502ebb78665694fb455857c5746ec373080c65..d7c3789e7e9e6f56ec03af9c2348a7e3e912c4c6 100644 (file)
@@ -478,6 +478,7 @@ struct rspamd_config {
        gint max_recipients;                           /**< maximum number of recipients to be processed        */
        guint max_blas_threads;                         /**< maximum threads for openblas when learning ANN             */
        guint max_opts_len;                             /**< maximum length for all options for a symbol                */
+       gsize max_html_len;                             /**< maximum length of HTML document                                    */
 
        struct module_s **compiled_modules;                /**< list of compiled C modules                                                      */
        struct worker_s **compiled_workers;                /**< list of compiled C modules                                                      */
index 01c2a6ad1a15b0a4002b3515ebc7f57b86285ca9..08d534eb3ee9df2573ff3058e57b980bdaf0ef5e 100644 (file)
@@ -1919,6 +1919,12 @@ rspamd_rcl_config_init (struct rspamd_config *cfg, GHashTable *skip_sections)
                                G_STRUCT_OFFSET (struct rspamd_config, max_word_len),
                                RSPAMD_CL_FLAG_UINT,
                                "Maximum length of the word to be considered in statistics/fuzzy");
+               rspamd_rcl_add_default_handler (sub,
+                       "max_html_len",
+                       rspamd_rcl_parse_struct_integer,
+                       G_STRUCT_OFFSET (struct rspamd_config, max_html_len), /* store into the gsize max_html_len field, not max_word_len */
+                       RSPAMD_CL_FLAG_INT_SIZE,
+                       "Maximum length of the html part to be parsed");
                rspamd_rcl_add_default_handler (sub,
                                "words_decay",
                                rspamd_rcl_parse_struct_integer,
index 09e2ab1588b50ac14da05aa1963b76d41f98557e..67bc9707017c150afcdd796519c2199a8c1a4c33 100644 (file)
@@ -75,6 +75,7 @@
 #define DEFAULT_MAX_SHOTS 100
 #define DEFAULT_MAX_SESSIONS 100
 #define DEFAULT_MAX_WORKERS 4
+#define DEFAULT_MAX_HTML_SIZE (DEFAULT_MAX_MESSAGE / 5) /* 10 Mb; parenthesised so the division cannot re-associate at the expansion site */
 /* Timeout for task processing */
 #define DEFAULT_TASK_TIMEOUT 8.0
 #define DEFAULT_LUA_GC_STEP 200
@@ -243,6 +244,7 @@ rspamd_config_new (enum rspamd_config_init_flags flags)
        cfg->words_decay = DEFAULT_WORDS_DECAY;
        cfg->min_word_len = DEFAULT_MIN_WORD;
        cfg->max_word_len = DEFAULT_MAX_WORD;
+       cfg->max_html_len = DEFAULT_MAX_HTML_SIZE;
 
        /* GC limits */
        cfg->lua_gc_pause = DEFAULT_LUA_GC_PAUSE;
index e2f48480449046b9a177be0ff909cf8fd2c601d6..91a59c8d0a392b66d75f6a9d8f9c07ca95e23aaf 100644 (file)
@@ -22,6 +22,8 @@
 #include "html.hxx"
 #include "libserver/css/css_value.hxx"
 #include "libserver/css/css.hxx"
+#include "libserver/task.h"
+#include "libserver/cfg_file.h"
 
 #include "url.h"
 #include "contrib/libucl/khash.h"
@@ -1321,7 +1323,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
 }
 
 auto
-html_process_input(rspamd_mempool_t *pool,
+html_process_input(struct rspamd_task *task,
                                   GByteArray *in,
                                   GList **exceptions,
                                   khash_t (rspamd_url_hash) *url_set,
@@ -1334,8 +1336,11 @@ html_process_input(rspamd_mempool_t *pool,
        guint obrace = 0, ebrace = 0;
        struct rspamd_url *url = nullptr;
        gint href_offset = -1;
+       auto overflow_input = false;
        struct html_tag *cur_tag = nullptr, *parent_tag = nullptr, cur_closing_tag;
        struct tag_content_parser_state content_parser_env;
+       auto process_size = in->len;
+
 
        enum {
                parse_start = 0,
@@ -1364,10 +1369,20 @@ html_process_input(rspamd_mempool_t *pool,
        } html_document_state = html_document_state::doctype;
 
        g_assert (in != NULL);
-       g_assert (pool != NULL);
+       g_assert (task != NULL);
+
+       auto *pool = task->task_pool;
 
-       struct html_content *hc = new html_content;
-       rspamd_mempool_add_destructor(pool, html_content::html_content_dtor, hc);
+       auto *hc = new html_content;
+       rspamd_mempool_add_destructor(task->task_pool, html_content::html_content_dtor, hc);
+
+       if (task->cfg && in->len > task->cfg->max_html_len) {
+               msg_notice_task("html input is too big: %z, limit is %z",
+                               static_cast<gsize>(in->len), /* GByteArray::len is a guint; widen it to match the size_t-wide %z conversion */
+                               task->cfg->max_html_len);
+               process_size = task->cfg->max_html_len; /* parse only the first max_html_len bytes */
+               overflow_input = true; /* remember that a tail was left unparsed */
+       }
 
        auto new_tag = [&](int flags = 0) -> struct html_tag * {
 
@@ -1525,7 +1540,7 @@ html_process_input(rspamd_mempool_t *pool,
 
        p = (const char *) in->data;
        c = p;
-       end = p + in->len;
+       end = p + process_size;
        start = c;
 
        while (p < end) {
@@ -2140,8 +2155,17 @@ html_process_input(rspamd_mempool_t *pool,
                break;
        }
 
+       if (overflow_input) {
+               /*
+                * Append the rest of the input as raw html, this might work as
+                * further algorithms can skip words when there are too many.
+                * It is still unclear about urls though...
+                */
+               hc->parsed.append(end, in->len - process_size);
+       }
+
        if (!hc->parsed.empty()) {
-               /* Trim extra spaces at the at the end if needed */
+               /* Trim extra spaces at the end if needed */
                if (g_ascii_isspace(hc->parsed.back())) {
                        auto last_it = std::end(hc->parsed);
 
@@ -2244,13 +2268,13 @@ html_tag::get_content(const struct html_content *hc) const -> std::string_view
 }
 
 void *
-rspamd_html_process_part_full(rspamd_mempool_t *pool,
+rspamd_html_process_part_full(struct rspamd_task *task,
                                                          GByteArray *in, GList **exceptions,
                                                          khash_t (rspamd_url_hash) *url_set,
                                                          GPtrArray *part_urls,
                                                          bool allow_css)
 {
-       return rspamd::html::html_process_input(pool, in, exceptions, url_set,
+       return rspamd::html::html_process_input(task, in, exceptions, url_set,
                        part_urls, allow_css);
 }
 
@@ -2258,7 +2282,11 @@ void *
 rspamd_html_process_part(rspamd_mempool_t *pool,
                                                 GByteArray *in)
 {
-       return rspamd_html_process_part_full (pool, in, NULL,
+       struct rspamd_task fake_task; /* stack-only stand-in: the full variant takes a task, but this entry point only receives a pool */
+       memset(&fake_task, 0, sizeof(fake_task)); /* zero every field, so fake_task.cfg is NULL and the cfg-guarded max_html_len check is skipped */
+       fake_task.task_pool = pool; /* the only field populated before the call */
+
+       return rspamd_html_process_part_full (&fake_task, in, NULL,
                        NULL, NULL, FALSE);
 }
 
index 8b690499e47b305ced496be1dee78ebfc2d44b93..2a43223f92d0cced68b2b05305edf8a99c1fafde 100644 (file)
@@ -70,7 +70,7 @@ guint rspamd_html_decode_entitles_inplace(gchar *s, gsize len);
 void* rspamd_html_process_part(rspamd_mempool_t *pool,
                                                           GByteArray *in);
 
-void *rspamd_html_process_part_full(rspamd_mempool_t *pool,
+void *rspamd_html_process_part_full(struct rspamd_task *task,
                                                                        GByteArray *in, GList **exceptions,
                                                                        khash_t (rspamd_url_hash) *url_set,
                                                                        GPtrArray *part_urls,