[Rework] Html: Start rework of the html content structure

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 31 May 2021 18:51:45 +0000 (19:51 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Wed, 2 Jun 2021 19:55:09 +0000 (20:55 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 31 May 2021 18:51:45 +0000 (19:51 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 2 Jun 2021 19:55:09 +0000 (20:55 +0100)
diff --git a/src/libserver/css/css.cxx b/src/libserver/css/css.cxx

index 033ecdc22d5ac7e45c513a0082782d724394c589..9b0e02230e5d7c3a82f2fe432ead8274e475ff0a 100644 (file)
--- a/src/libserver/css/css.cxx
+++ b/src/libserver/css/css.cxx
@@ -33,7 +33,7 @@ rspamd_css_dtor(void *p)
  }
  
  rspamd_css_ptr
-rspamd_css_parse_style(rspamd_mempool_t *pool, const guchar *begin, gsize len,
+rspamd_css_parse_style(rspamd_mempool_t *pool, const gchar *begin, gsize len,
                                            rspamd_css_ptr existing_style,
                                            GError **err)
  {
diff --git a/src/libserver/css/css.h b/src/libserver/css/css.h

index 1dabf00b83282c5d75aa9aa650cedb4d096b3ff9..607f1fa2c825477310a469c6b20855283cb46a4c 100644 (file)
--- a/src/libserver/css/css.h
+++ b/src/libserver/css/css.h
@@ -26,7 +26,7 @@ extern "C" {
  typedef void * rspamd_css_ptr;
  
  rspamd_css_ptr rspamd_css_parse_style (rspamd_mempool_t *pool,
-                                                                          const guchar *begin,
+                                                                          const gchar *begin,
                                                                            gsize len,
                                                                            rspamd_css_ptr existing_style,
                                                                            GError **err);
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx

index 2f124c65ff3979ca58986286b7906e49d0b76e02..97364979180443fa7358c717aaa41139566ec92b 100644 (file)
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -19,6 +19,7 @@
  #include "message.h"
  #include "html.h"
  #include "html_tags.h"
+#include "html.hxx"
  #include "libserver/css/css_value.hxx"
  
  #include "url.h"
@@ -112,15 +113,6 @@ html_process_tag(rspamd_mempool_t *pool,
         GNode *nnode;
         struct html_tag *parent;
  
-       if (hc->html_tags == NULL) {
-               nnode = g_node_new(NULL);
-               *cur_level = nnode;
-               hc->html_tags = nnode;
-               rspamd_mempool_add_destructor (pool,
-                               (rspamd_mempool_destruct_t) g_node_destroy,
-                               nnode);
-       }
-
         if (hc->total_tags > rspamd::html::max_tags) {
                 hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
         }
@@ -131,6 +123,10 @@ html_process_tag(rspamd_mempool_t *pool,
                 return FALSE;
         }
  
+       if (*cur_level == nullptr) {
+               *cur_level = hc->html_tags;
+       }
+
         tag->parent = *cur_level;
  
         if (!(tag->flags & (CM_INLINE | CM_EMPTY))) {
@@ -819,8 +815,7 @@ html_process_img_tag(rspamd_mempool_t *pool,
                                          struct html_tag *tag,
                                          struct html_content *hc,
                                          khash_t (rspamd_url_hash) *url_set,
-                                        GPtrArray *part_urls,
-                                        GByteArray *dest)
+                                        GPtrArray *part_urls)
  {
         struct html_image *img;
  
@@ -1667,26 +1662,23 @@ tags_vector_ptr_dtor(void *ptr)
  
  static auto
  html_process_part_full (rspamd_mempool_t *pool,
-                                               struct html_content *hc,
                                                 GByteArray *in,
                                                 GList **exceptions,
                                                 khash_t (rspamd_url_hash) *url_set,
                                                 GPtrArray *part_urls,
-                                               bool allow_css) -> GByteArray*
+                                               bool allow_css) -> html_content *
  {
-       const guchar *p, *c, *end;
+       const gchar *p, *c, *end;
         guchar t;
         gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE,
                         balanced;
-       GByteArray *dest;
         guint obrace = 0, ebrace = 0;
         GNode *cur_level = NULL;
         struct rspamd_url *url = NULL;
         gint len, href_offset = -1;
         struct html_tag *cur_tag = NULL, *content_tag = NULL;
-       std::vector<struct html_block *> styles_blocks;
+       std::vector<html_block *> blocks_stack;
         struct tag_content_parser_state content_parser_env;
-       tags_vector *all_tags;
  
         enum {
                 parse_start = 0,
@@ -1707,25 +1699,12 @@ html_process_part_full (rspamd_mempool_t *pool,
         } state = parse_start;
  
         g_assert (in != NULL);
-       g_assert (hc != NULL);
         g_assert (pool != NULL);
  
-       all_tags = new tags_vector(128);
-       rspamd_mempool_add_destructor(pool, tags_vector_ptr_dtor, all_tags);
-
-       hc->tags_seen = (guchar *)rspamd_mempool_alloc0 (pool, NBYTES (N_TAGS));
+       struct html_content *hc = new html_content;
+       rspamd_mempool_add_destructor(pool, html_content::html_content_dtor, hc);
  
-       /* Set white background color by default */
-       hc->bgcolor.d.comp.alpha = 0;
-       hc->bgcolor.d.comp.r = 255;
-       hc->bgcolor.d.comp.g = 255;
-       hc->bgcolor.d.comp.b = 255;
-       hc->bgcolor.valid = TRUE;
-
-       dest = g_byte_array_sized_new (in->len / 3 * 2);
-       styles_blocks.reserve(32);
-
-       p = in->data;
+       p = (const char *)in->data;
         c = p;
         end = p + in->len;
  
@@ -1772,8 +1751,8 @@ html_process_part_full (rspamd_mempool_t *pool,
                                 state = tag_content;
                                 content_parser_env.reset();
  
-                               all_tags->emplace_back(std::make_unique<html_tag>());
-                               cur_tag = all_tags->back().get();
+                               hc->all_tags.emplace_back(std::make_unique<html_tag>());
+                               cur_tag = hc->all_tags.back().get();
                                 break;
                         }
  
@@ -1904,7 +1883,7 @@ html_process_part_full (rspamd_mempool_t *pool,
  
                                         if (p > c) {
                                                 if (need_decode) {
-                                                       goffset old_offset = dest->len;
+                                                       goffset old_offset = hc->parsed.size();
  
                                                         if (content_tag) {
                                                                 if (content_tag->content_length == 0) {
@@ -1912,12 +1891,12 @@ html_process_part_full (rspamd_mempool_t *pool,
                                                                 }
                                                         }
  
-                                                       g_byte_array_append (dest, c, (p - c));
+                                                       hc->parsed.append(c, p - c);
  
                                                         len = decode_html_entitles_inplace(
-                                                                       reinterpret_cast<gchar *>(dest->data + old_offset),
+                                                                       hc->parsed.data() + old_offset,
                                                                         (std::size_t)(p - c));
-                                                       dest->len = dest->len + len - (p - c);
+                                                       hc->parsed.resize(hc->parsed.size() + len - (p - c));
  
                                                         if (content_tag) {
                                                                 content_tag->content_length += len;
@@ -1928,13 +1907,13 @@ html_process_part_full (rspamd_mempool_t *pool,
  
                                                         if (content_tag) {
                                                                 if (content_tag->content_length == 0) {
-                                                                       content_tag->content_offset = dest->len;
+                                                                       content_tag->content_offset = hc->parsed.size();
                                                                 }
  
                                                                 content_tag->content_length += len;
                                                         }
  
-                                                       g_byte_array_append (dest, c, len);
+                                                       hc->parsed.append(c, len);
                                                 }
                                         }
  
@@ -1944,10 +1923,10 @@ html_process_part_full (rspamd_mempool_t *pool,
                                 else {
                                         if (save_space) {
                                                 /* Append one space if needed */
-                                               if (dest->len > 0 &&
-                                                       !g_ascii_isspace (dest->data[dest->len - 1])) {
-                                                       g_byte_array_append (dest,
-                                                                       reinterpret_cast<const guint8 *>(" "), 1);
+                                               if (!hc->parsed.empty() &&
+                                                       !g_ascii_isspace (hc->parsed.back())) {
+                                                       hc->parsed += " ";
+
                                                         if (content_tag) {
                                                                 if (content_tag->content_length == 0) {
                                                                         /*
@@ -1956,7 +1935,7 @@ html_process_part_full (rspamd_mempool_t *pool,
                                                                          * we have no set content_offset
                                                                          * so we need to do it here
                                                                          */
-                                                                       content_tag->content_offset = dest->len;
+                                                                       content_tag->content_offset = hc->parsed.size();
                                                                 }
                                                                 else {
                                                                         content_tag->content_length++;
@@ -1971,19 +1950,19 @@ html_process_part_full (rspamd_mempool_t *pool,
                                 if (c != p) {
  
                                         if (need_decode) {
-                                               goffset old_offset = dest->len;
+                                               goffset old_offset = hc->parsed.size();
  
                                                 if (content_tag) {
                                                         if (content_tag->content_length == 0) {
-                                                               content_tag->content_offset = dest->len;
+                                                               content_tag->content_offset = hc->parsed.size();
                                                         }
                                                 }
  
-                                               g_byte_array_append (dest, c, (p - c));
-                                               len = decode_html_entitles_inplace (
-                                                               reinterpret_cast<gchar *>(dest->data + old_offset),
-                                                               p - c);
-                                               dest->len = dest->len + len - (p - c);
+                                               hc->parsed.append(c, p - c);
+                                               len = decode_html_entitles_inplace(
+                                                               hc->parsed.data() + old_offset,
+                                                               (std::size_t)(p - c));
+                                               hc->parsed.resize(hc->parsed.size() + len - (p - c));
  
                                                 if (content_tag) {
                                                         content_tag->content_length += len;
@@ -1994,13 +1973,13 @@ html_process_part_full (rspamd_mempool_t *pool,
  
                                                 if (content_tag) {
                                                         if (content_tag->content_length == 0) {
-                                                               content_tag->content_offset = dest->len;
+                                                               content_tag->content_offset = hc->parsed.size();
                                                         }
  
                                                         content_tag->content_length += len;
                                                 }
  
-                                               g_byte_array_append (dest, c, len);
+                                               hc->parsed.append(c, len);
                                         }
                                 }
  
@@ -2019,7 +1998,7 @@ html_process_part_full (rspamd_mempool_t *pool,
                          * We just search for the first </s substring and then pass
                          * the content to the parser (if needed)
                          */
-                       goffset end_style = rspamd_substring_search (reinterpret_cast<const gchar *>(p), end - p,
+                       goffset end_style = rspamd_substring_search (p, end - p,
                                         "</", 2);
                         if (end_style == -1 || g_ascii_tolower (p[end_style + 2]) != 's') {
                                 /* Invalid style */
@@ -2066,8 +2045,7 @@ html_process_part_full (rspamd_mempool_t *pool,
                         break;
  
                 case tag_content:
-                       parse_tag_content(pool, hc, cur_tag,
-                                       reinterpret_cast<const char *>(p), content_parser_env);
+                       parse_tag_content(pool, hc, cur_tag, p, content_parser_env);
                         if (t == '>') {
                                 if (closing) {
                                         cur_tag->flags |= FL_CLOSING;
@@ -2108,12 +2086,12 @@ html_process_part_full (rspamd_mempool_t *pool,
  
                                 if (cur_tag->id != -1 && cur_tag->id < N_TAGS) {
                                         if (cur_tag->flags & CM_UNIQUE) {
-                                               if (isset (hc->tags_seen, cur_tag->id)) {
+                                               if (hc->tags_seen[cur_tag->id]) {
                                                         /* Duplicate tag has been found */
                                                         hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS;
                                                 }
                                         }
-                                       setbit (hc->tags_seen, cur_tag->id);
+                                       hc->tags_seen[cur_tag->id] = true;
                                 }
  
                                 if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) {
@@ -2122,9 +2100,10 @@ html_process_part_full (rspamd_mempool_t *pool,
  
                                 /* Handle newlines */
                                 if (cur_tag->id == Tag_BR || cur_tag->id == Tag_HR) {
-                                       if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
-                                               g_byte_array_append (dest,
-                                                               reinterpret_cast<const guint8 *>("\r\n"), 2);
+                                       if (!hc->parsed.empty() &&
+                                               hc->parsed.back() != '\n') {
+
+                                               hc->parsed += "\r\n";
  
                                                 if (content_tag) {
                                                         if (content_tag->content_length == 0) {
@@ -2134,7 +2113,7 @@ html_process_part_full (rspamd_mempool_t *pool,
                                                                  * we have no set content_offset
                                                                  * so we need to do it here
                                                                  */
-                                                               content_tag->content_offset = dest->len;
+                                                               content_tag->content_offset = hc->parsed.size();
                                                         }
                                                         else {
                                                                 content_tag->content_length += 2;
@@ -2147,8 +2126,10 @@ html_process_part_full (rspamd_mempool_t *pool,
                                 if ((cur_tag->id == Tag_P ||
                                          cur_tag->id == Tag_TR ||
                                          cur_tag->id == Tag_DIV)) {
-                                       if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
-                                               g_byte_array_append (dest, reinterpret_cast<const guint8 *>("\r\n"), 2);
+                                       if (!hc->parsed.empty() &&
+                                               hc->parsed.back() != '\n') {
+
+                                               hc->parsed += "\r\n";
  
                                                 if (content_tag) {
                                                         if (content_tag->content_length == 0) {
@@ -2158,7 +2139,7 @@ html_process_part_full (rspamd_mempool_t *pool,
                                                                  * we have no set content_offset
                                                                  * so we need to get it here
                                                                  */
-                                                               content_tag->content_offset = dest->len;
+                                                               content_tag->content_offset = hc->parsed.size();
                                                         }
                                                         else {
                                                                 content_tag->content_length += 2;
@@ -2190,7 +2171,7 @@ html_process_part_full (rspamd_mempool_t *pool,
                                                                 }
                                                         }
  
-                                                       href_offset = dest->len;
+                                                       href_offset = hc->parsed.size();
                                                 }
                                         }
  
@@ -2207,8 +2188,8 @@ html_process_part_full (rspamd_mempool_t *pool,
                                                                 prev_url = std::get<rspamd_url *>(prev_tag->extra);
  
                                                                 std::string_view disp_part{
-                                                                               reinterpret_cast<const gchar *>(dest->data + href_offset),
-                                                                               dest->len - href_offset};
+                                                                               hc->parsed.data() + href_offset,
+                                                                               hc->parsed.size() - href_offset};
                                                                 html_check_displayed_url (pool,
                                                                                 exceptions, url_set,
                                                                                 disp_part,
@@ -2220,10 +2201,10 @@ html_process_part_full (rspamd_mempool_t *pool,
                                                 if (cur_tag->flags & (FL_CLOSING)) {
  
                                                         /* Insert exception */
-                                                       if (url != NULL && (gint) dest->len > href_offset) {
+                                                       if (url != NULL && hc->parsed.size() > href_offset) {
                                                                 std::string_view disp_part{
-                                                                       reinterpret_cast<const gchar *>(dest->data + href_offset),
-                                                                       dest->len - href_offset};
+                                                                               hc->parsed.data() + href_offset,
+                                                                               hc->parsed.size() - href_offset};
                                                                 html_check_displayed_url (pool,
                                                                                 exceptions, url_set,
                                                                                 disp_part,
@@ -2258,7 +2239,7 @@ html_process_part_full (rspamd_mempool_t *pool,
  
                                 if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
                                         html_process_img_tag(pool, cur_tag, hc, url_set,
-                                                       part_urls, dest);
+                                                       part_urls);
                                 }
                                 else if (cur_tag->id == Tag_LINK && !(cur_tag->flags & FL_CLOSING)) {
                                         html_process_link_tag(pool, cur_tag, hc, url_set,
@@ -2269,8 +2250,8 @@ html_process_part_full (rspamd_mempool_t *pool,
  
                                         if (cur_tag->flags & FL_CLOSING) {
                                                 /* Just remove block element from the queue if any */
-                                               if (!styles_blocks.empty()) {
-                                                       styles_blocks.pop_back();
+                                               if (!blocks_stack.empty()) {
+                                                       blocks_stack.pop_back();
                                                 }
                                         }
                                         else {
@@ -2279,7 +2260,7 @@ html_process_part_full (rspamd_mempool_t *pool,
  
                                                 if (bl) {
                                                         html_propagate_style(hc, cur_tag,
-                                                                       bl, styles_blocks);
+                                                                       bl, blocks_stack);
  
                                                         /* Check visibility */
                                                         if (bl->font_size < 3 ||
@@ -2316,32 +2297,27 @@ html_process_part_full (rspamd_mempool_t *pool,
                                 html_propagate_lengths, NULL);
         }
  
-       hc->parsed = dest;
-
-       return dest;
+       return hc;
  }
  
  }
  
-GByteArray*
-rspamd_html_process_part_full (rspamd_mempool_t *pool,
-                                                          struct html_content *hc,
-                                                          GByteArray *in,
-                                                          GList **exceptions,
-                                                          khash_t (rspamd_url_hash) *url_set,
-                                                          GPtrArray *part_urls,
-                                                          bool allow_css)
+void *
+rspamd_html_process_part_full(rspamd_mempool_t *pool,
+                                                         GByteArray *in, GList **exceptions,
+                                                         khash_t (rspamd_url_hash) *url_set,
+                                                         GPtrArray *part_urls,
+                                                         bool allow_css)
  {
-       return rspamd::html::html_process_part_full(pool, hc, in, exceptions, url_set,
+       return rspamd::html::html_process_part_full(pool, in, exceptions, url_set,
                         part_urls, allow_css);
  }
  
-GByteArray*
-rspamd_html_process_part (rspamd_mempool_t *pool,
-               struct html_content *hc,
-               GByteArray *in)
+void *
+rspamd_html_process_part(rspamd_mempool_t *pool,
+                                                GByteArray *in)
  {
-       return rspamd_html_process_part_full (pool, hc, in, NULL,
+       return rspamd_html_process_part_full (pool, in, NULL,
                         NULL, NULL, FALSE);
  }
  
@@ -2369,7 +2345,6 @@ rspamd_html_tag_seen(struct html_content *hc, const gchar *tagname)
         gint id;
  
         g_assert (hc != NULL);
-       g_assert (hc->tags_seen != NULL);
  
         id = rspamd_html_tag_by_name(tagname);
  
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h

index 6106688f37ba6b004d514a3a9b54a55980a7843a..23faa47d38c2823523a62447f193d98455008a4f 100644 (file)
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -102,35 +102,21 @@ struct html_block {
  
  /* Forwarded declaration */
  struct rspamd_task;
-
-struct html_content {
-       struct rspamd_url *base_url;
-       GNode *html_tags;
-       gint flags;
-       guint total_tags;
-       struct html_color bgcolor;
-       guchar *tags_seen;
-       GPtrArray *images;
-       GPtrArray *blocks;
-       GByteArray *parsed;
-       void *css_style;
-};
+struct html_content;
  
  /*
   * Decode HTML entitles in text. Text is modified in place.
   */
  guint rspamd_html_decode_entitles_inplace(gchar *s, gsize len);
  
-GByteArray *rspamd_html_process_part(rspamd_mempool_t *pool,
-                                                                         struct html_content *hc,
-                                                                         GByteArray *in);
+void* rspamd_html_process_part(rspamd_mempool_t *pool,
+                                                          GByteArray *in);
  
-GByteArray *rspamd_html_process_part_full(rspamd_mempool_t *pool,
-                                                                                  struct html_content *hc,
-                                                                                  GByteArray *in, GList **exceptions,
-                                                                                  khash_t (rspamd_url_hash) *url_set,
-                                                                                  GPtrArray *part_urls,
-                                                                                  bool allow_css);
+void *rspamd_html_process_part_full(rspamd_mempool_t *pool,
+                                                                       GByteArray *in, GList **exceptions,
+                                                                       khash_t (rspamd_url_hash) *url_set,
+                                                                       GPtrArray *part_urls,
+                                                                       bool allow_css);
  
  /*
   * Returns true if a specified tag has been seen in a part
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 31 May 2021 18:51:45 +0000 (19:51 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Wed, 2 Jun 2021 19:55:09 +0000 (20:55 +0100)
src/libserver/css/css.cxx		patch \| blob \| blame \| history
src/libserver/css/css.h		patch \| blob \| blame \| history
src/libserver/html/html.cxx		patch \| blob \| blame \| history
src/libserver/html/html.h		patch \| blob \| blame \| history