[Feature] Add html parsing limit

author Vsevolod Stakhov <vsevolod@rspamd.com>

Wed, 26 Apr 2023 21:54:24 +0000 (22:54 +0100)

committer Vsevolod Stakhov <vsevolod@rspamd.com>

Wed, 26 Apr 2023 21:54:24 +0000 (22:54 +0100)
author Vsevolod Stakhov <vsevolod@rspamd.com>
Wed, 26 Apr 2023 21:54:24 +0000 (22:54 +0100)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Wed, 26 Apr 2023 21:54:24 +0000 (22:54 +0100)
diff --git a/src/libmime/message.c b/src/libmime/message.c

index ec49b3b5e21e1d6d117ca8fcf93612a4aa8a4b5c..ad2cccf929ad6656dc6da67ea7a59219f9ad9743 100644 (file)
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -766,7 +766,7 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
  
  
         text_part->html = rspamd_html_process_part_full (
-                       task->task_pool,
+                       task,
                         text_part->utf_raw_content,
                         &text_part->exceptions,
                         MESSAGE_FIELD (task, urls),
diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h

index 44502ebb78665694fb455857c5746ec373080c65..d7c3789e7e9e6f56ec03af9c2348a7e3e912c4c6 100644 (file)
--- a/src/libserver/cfg_file.h
+++ b/src/libserver/cfg_file.h
@@ -478,6 +478,7 @@ struct rspamd_config {
         gint max_recipients;                           /**< maximum number of recipients to be processed        */
         guint max_blas_threads;                         /**< maximum threads for openblas when learning ANN             */
         guint max_opts_len;                             /**< maximum length for all options for a symbol                */
+       gsize max_html_len;                             /**< maximum length of HTML document                                    */
  
         struct module_s **compiled_modules;                /**< list of compiled C modules                                                      */
         struct worker_s **compiled_workers;                /**< list of compiled C modules                                                      */
diff --git a/src/libserver/cfg_rcl.c b/src/libserver/cfg_rcl.c

index 01c2a6ad1a15b0a4002b3515ebc7f57b86285ca9..08d534eb3ee9df2573ff3058e57b980bdaf0ef5e 100644 (file)
--- a/src/libserver/cfg_rcl.c
+++ b/src/libserver/cfg_rcl.c
@@ -1919,6 +1919,12 @@ rspamd_rcl_config_init (struct rspamd_config *cfg, GHashTable *skip_sections)
                                 G_STRUCT_OFFSET (struct rspamd_config, max_word_len),
                                 RSPAMD_CL_FLAG_UINT,
                                 "Maximum length of the word to be considered in statistics/fuzzy");
+               rspamd_rcl_add_default_handler (sub,
+                       "max_html_len",
+                       rspamd_rcl_parse_struct_integer,
+                       G_STRUCT_OFFSET (struct rspamd_config, max_html_len),
+                       RSPAMD_CL_FLAG_INT_SIZE,
+                       "Maximum length of the html part to be parsed");
                 rspamd_rcl_add_default_handler (sub,
                                 "words_decay",
                                 rspamd_rcl_parse_struct_integer,
diff --git a/src/libserver/cfg_utils.c b/src/libserver/cfg_utils.c

index 09e2ab1588b50ac14da05aa1963b76d41f98557e..67bc9707017c150afcdd796519c2199a8c1a4c33 100644 (file)
--- a/src/libserver/cfg_utils.c
+++ b/src/libserver/cfg_utils.c
@@ -75,6 +75,7 @@
  #define DEFAULT_MAX_SHOTS 100
  #define DEFAULT_MAX_SESSIONS 100
  #define DEFAULT_MAX_WORKERS 4
+#define DEFAULT_MAX_HTML_SIZE (DEFAULT_MAX_MESSAGE / 5) /* 10 Mb */
  /* Timeout for task processing */
  #define DEFAULT_TASK_TIMEOUT 8.0
  #define DEFAULT_LUA_GC_STEP 200
@@ -243,6 +244,7 @@ rspamd_config_new (enum rspamd_config_init_flags flags)
         cfg->words_decay = DEFAULT_WORDS_DECAY;
         cfg->min_word_len = DEFAULT_MIN_WORD;
         cfg->max_word_len = DEFAULT_MAX_WORD;
+       cfg->max_html_len = DEFAULT_MAX_HTML_SIZE;
  
         /* GC limits */
         cfg->lua_gc_pause = DEFAULT_LUA_GC_PAUSE;
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx

index e2f48480449046b9a177be0ff909cf8fd2c601d6..91a59c8d0a392b66d75f6a9d8f9c07ca95e23aaf 100644 (file)
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -22,6 +22,8 @@
  #include "html.hxx"
  #include "libserver/css/css_value.hxx"
  #include "libserver/css/css.hxx"
+#include "libserver/task.h"
+#include "libserver/cfg_file.h"
  
  #include "url.h"
  #include "contrib/libucl/khash.h"
@@ -1321,7 +1323,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
  }
  
  auto
-html_process_input(rspamd_mempool_t *pool,
+html_process_input(struct rspamd_task *task,
                                    GByteArray *in,
                                    GList **exceptions,
                                    khash_t (rspamd_url_hash) *url_set,
@@ -1334,8 +1336,11 @@ html_process_input(rspamd_mempool_t *pool,
         guint obrace = 0, ebrace = 0;
         struct rspamd_url *url = nullptr;
         gint href_offset = -1;
+       auto overflow_input = false;
         struct html_tag *cur_tag = nullptr, *parent_tag = nullptr, cur_closing_tag;
         struct tag_content_parser_state content_parser_env;
+       auto process_size = in->len;
+
  
         enum {
                 parse_start = 0,
@@ -1364,10 +1369,20 @@ html_process_input(rspamd_mempool_t *pool,
         } html_document_state = html_document_state::doctype;
  
         g_assert (in != NULL);
-       g_assert (pool != NULL);
+       g_assert (task != NULL);
+
+       auto *pool = task->task_pool;
  
-       struct html_content *hc = new html_content;
-       rspamd_mempool_add_destructor(pool, html_content::html_content_dtor, hc);
+       auto *hc = new html_content;
+       rspamd_mempool_add_destructor(task->task_pool, html_content::html_content_dtor, hc);
+
+       if (task->cfg && in->len > task->cfg->max_html_len) {
+               msg_notice_task("html input is too big: %z, limit is %z",
+                               in->len,
+                               task->cfg->max_html_len);
+               process_size = task->cfg->max_html_len;
+               overflow_input = true;
+       }
  
         auto new_tag = [&](int flags = 0) -> struct html_tag * {
  
@@ -1525,7 +1540,7 @@ html_process_input(rspamd_mempool_t *pool,
  
         p = (const char *) in->data;
         c = p;
-       end = p + in->len;
+       end = p + process_size;
         start = c;
  
         while (p < end) {
@@ -2140,8 +2155,17 @@ html_process_input(rspamd_mempool_t *pool,
                 break;
         }
  
+       if (overflow_input) {
+               /*
+                * Append the rest of the input as raw html, this might work as
+                * further algorithms can skip words when there are too many.
+                * It is still unclear about urls though...
+                */
+               hc->parsed.append(end, in->len - process_size);
+       }
+
         if (!hc->parsed.empty()) {
-               /* Trim extra spaces at the at the end if needed */
+               /* Trim extra spaces at the end if needed */
                 if (g_ascii_isspace(hc->parsed.back())) {
                         auto last_it = std::end(hc->parsed);
  
@@ -2244,13 +2268,13 @@ html_tag::get_content(const struct html_content *hc) const -> std::string_view
  }
  
  void *
-rspamd_html_process_part_full(rspamd_mempool_t *pool,
+rspamd_html_process_part_full(struct rspamd_task *task,
                                                           GByteArray *in, GList **exceptions,
                                                           khash_t (rspamd_url_hash) *url_set,
                                                           GPtrArray *part_urls,
                                                           bool allow_css)
  {
-       return rspamd::html::html_process_input(pool, in, exceptions, url_set,
+       return rspamd::html::html_process_input(task, in, exceptions, url_set,
                         part_urls, allow_css);
  }
  
@@ -2258,7 +2282,11 @@ void *
  rspamd_html_process_part(rspamd_mempool_t *pool,
                                                  GByteArray *in)
  {
-       return rspamd_html_process_part_full (pool, in, NULL,
+       struct rspamd_task fake_task;
+       memset(&fake_task, 0, sizeof(fake_task));
+       fake_task.task_pool = pool;
+
+       return rspamd_html_process_part_full (&fake_task, in, NULL,
                         NULL, NULL, FALSE);
  }
  
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h

index 8b690499e47b305ced496be1dee78ebfc2d44b93..2a43223f92d0cced68b2b05305edf8a99c1fafde 100644 (file)
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -70,7 +70,7 @@ guint rspamd_html_decode_entitles_inplace(gchar *s, gsize len);
  void* rspamd_html_process_part(rspamd_mempool_t *pool,
                                                            GByteArray *in);
  
-void *rspamd_html_process_part_full(rspamd_mempool_t *pool,
+void *rspamd_html_process_part_full(struct rspamd_task *task,
                                                                         GByteArray *in, GList **exceptions,
                                                                         khash_t (rspamd_url_hash) *url_set,
                                                                         GPtrArray *part_urls,
author	Vsevolod Stakhov <vsevolod@rspamd.com>
	Wed, 26 Apr 2023 21:54:24 +0000 (22:54 +0100)
committer	Vsevolod Stakhov <vsevolod@rspamd.com>
	Wed, 26 Apr 2023 21:54:24 +0000 (22:54 +0100)
src/libmime/message.c		patch \| blob \| blame \| history
src/libserver/cfg_file.h		patch \| blob \| blame \| history
src/libserver/cfg_rcl.c		patch \| blob \| blame \| history
src/libserver/cfg_utils.c		patch \| blob \| blame \| history
src/libserver/html/html.cxx		patch \| blob \| blame \| history
src/libserver/html/html.h		patch \| blob \| blame \| history