G_STRUCT_OFFSET (struct rspamd_config, max_word_len),
RSPAMD_CL_FLAG_UINT,
"Maximum length of the word to be considered in statistics/fuzzy");
+ /* Cap on the size of an HTML part that the parser will process */
+ rspamd_rcl_add_default_handler (sub,
+ "max_html_len",
+ rspamd_rcl_parse_struct_integer,
+ G_STRUCT_OFFSET (struct rspamd_config, max_html_len),
+ RSPAMD_CL_FLAG_INT_SIZE,
+ "Maximum length of the html part to be parsed");
rspamd_rcl_add_default_handler (sub,
"words_decay",
rspamd_rcl_parse_struct_integer,
#include "html.hxx"
#include "libserver/css/css_value.hxx"
#include "libserver/css/css.hxx"
+#include "libserver/task.h"
+#include "libserver/cfg_file.h"
#include "url.h"
#include "contrib/libucl/khash.h"
}
auto
-html_process_input(rspamd_mempool_t *pool,
+html_process_input(struct rspamd_task *task,
GByteArray *in,
GList **exceptions,
khash_t (rspamd_url_hash) *url_set,
guint obrace = 0, ebrace = 0;
struct rspamd_url *url = nullptr;
gint href_offset = -1;
+ auto overflow_input = false;
struct html_tag *cur_tag = nullptr, *parent_tag = nullptr, cur_closing_tag;
struct tag_content_parser_state content_parser_env;
+ auto process_size = in->len;
+
enum {
parse_start = 0,
} html_document_state = html_document_state::doctype;
g_assert (in != NULL);
- g_assert (pool != NULL);
+ g_assert (task != NULL);
+
+ auto *pool = task->task_pool;
- struct html_content *hc = new html_content;
- rspamd_mempool_add_destructor(pool, html_content::html_content_dtor, hc);
+ auto *hc = new html_content;
+ rspamd_mempool_add_destructor(task->task_pool, html_content::html_content_dtor, hc);
+
+ if (task->cfg && in->len > task->cfg->max_html_len) {
+ msg_notice_task("html input is too big: %z, limit is %z",
+ in->len,
+ task->cfg->max_html_len);
+ process_size = task->cfg->max_html_len;
+ overflow_input = true;
+ }
auto new_tag = [&](int flags = 0) -> struct html_tag * {
p = (const char *) in->data;
c = p;
- end = p + in->len;
+ end = p + process_size;
start = c;
while (p < end) {
break;
}
+ if (overflow_input) {
+ /*
+ * Append the rest of the input as raw html, this might work as
+ * further algorithms can skip words when there are too many.
+ * It is still unclear about urls though...
+ */
+ hc->parsed.append(end, in->len - process_size);
+ }
+
if (!hc->parsed.empty()) {
- /* Trim extra spaces at the at the end if needed */
+ /* Trim extra spaces at the end if needed */
if (g_ascii_isspace(hc->parsed.back())) {
auto last_it = std::end(hc->parsed);
}
/*
 * Thin wrapper: forwards every argument unchanged to
 * rspamd::html::html_process_input and returns its result as void *.
 */
void *
-rspamd_html_process_part_full(rspamd_mempool_t *pool,
+rspamd_html_process_part_full(struct rspamd_task *task,
GByteArray *in, GList **exceptions,
khash_t (rspamd_url_hash) *url_set,
GPtrArray *part_urls,
bool allow_css)
{
- return rspamd::html::html_process_input(pool, in, exceptions, url_set,
+ return rspamd::html::html_process_input(task, in, exceptions, url_set,
part_urls, allow_css);
}
rspamd_html_process_part(rspamd_mempool_t *pool,
GByteArray *in)
{
- return rspamd_html_process_part_full (pool, in, NULL,
/*
 * The full variant takes a task rather than a bare pool, so build a
 * zero-filled task on the stack that carries only the caller's pool.
 * Its cfg stays NULL, and html_process_input tests task->cfg before
 * applying the max_html_len limit, so no size limit applies on this path.
 */
+ struct rspamd_task fake_task;
+ memset(&fake_task, 0, sizeof(fake_task));
+ fake_task.task_pool = pool;
+
+ return rspamd_html_process_part_full (&fake_task, in, NULL,
NULL, NULL, FALSE);
}