[Project] Html: Try to deal with bad unknown tags properly

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 5 Jul 2021 16:40:59 +0000 (17:40 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 5 Jul 2021 16:40:59 +0000 (17:40 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 5 Jul 2021 16:40:59 +0000 (17:40 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 5 Jul 2021 16:40:59 +0000 (17:40 +0100)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx

index d46d291d610892aad3e74bfe042b3b2b92e4205f..b29a7d37dbda79735b2d471d87d6fbcf0f4f1269 100644 (file)
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -354,7 +354,10 @@ html_parse_tag_content(rspamd_mempool_t *pool,
  
                                 if (tag_def == nullptr) {
                                         hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
-                                       tag->id = N_TAGS;
+                                       /* Derive a negative id from the tag name hash (sign bit is forced below) so a closing tag with the same name can be matched if needed */
+                                       auto nhash = static_cast<std::int32_t>(std::hash<std::string_view>{}({s, nsize}));
+                                       /* Always negative */
+                                       tag->id = static_cast<tag_id_t>(nhash | G_MININT32);
                                 }
                                 else {
                                         tag->id = tag_def->id;
@@ -1040,7 +1043,7 @@ static auto
  html_append_tag_content(rspamd_mempool_t *pool,
                                                 const gchar *start, gsize len,
                                                 struct html_content *hc,
-                                               const html_tag *tag,
+                                               html_tag *tag,
                                                 GList **exceptions,
                                                 khash_t (rspamd_url_hash) *url_set) -> goffset
  {
@@ -1048,6 +1051,20 @@ html_append_tag_content(rspamd_mempool_t *pool,
         goffset next_tag_offset = tag->closing.end,
                         initial_dest_offset = hc->parsed.size();
  
+       if (tag->closing.end == -1) {
+               if (tag->closing.start != -1) {
+                       next_tag_offset = tag->closing.start;
+                       tag->closing.end = tag->closing.start;
+               }
+               else {
+                       next_tag_offset = len;
+                       tag->closing.end = len;
+               }
+       }
+       if (tag->closing.start == -1) {
+               tag->closing.start = tag->closing.end;
+       }
+
         auto append_margin = [&](char c) -> void {
                 if (is_visible) {
                         if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') {
@@ -1072,7 +1089,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
  
                 return tag->content_offset;
         }
-       else if (tag->id == Tag_HEAD || tag->id >= N_TAGS) {
+       else if (tag->id == Tag_HEAD) {
                 return tag->closing.end;
         }
  
@@ -1235,7 +1252,7 @@ html_process_input(rspamd_mempool_t *pool,
         };
  
         auto process_opening_tag = [&]() {
-               if (cur_tag->id < N_TAGS) {
+               if (cur_tag->id > Tag_UNKNOWN) {
                         if (cur_tag->flags & CM_UNIQUE) {
                                 if (!hc->tags_seen[cur_tag->id]) {
                                         /* Duplicate tag has been found */
@@ -1796,12 +1813,6 @@ html_process_input(rspamd_mempool_t *pool,
  
         /* Leftover after content */
         switch (state) {
-       case html_text_content:
-       case content_before_start:
-               if (p > c) {
-                       html_append_content(hc, {c, std::size_t(p - c)});
-               }
-               break;
         case tag_end_opening:
                 if (cur_tag != nullptr) {
                         process_opening_tag();
@@ -1934,12 +1945,6 @@ TEST_CASE("html text extraction")
  {
  
         const std::vector<std::pair<std::string, std::string>> cases{
-                       {"</head>\n"
-                        "<body>\n"
-                        "<p> Hello. I have some bad news.\n"
-                        "<br /> <br /> <br /> <strong> <br /> <br /> <br /> <br /> <br /> <br /> <br /> <br /> </strong><span> <br /> </span>test</p>\n"
-                        "</body>\n"
-                        "</html>", "Hello. I have some bad news. \n\n\n\n\n\n\n\n\n\n\n\ntest\n"},
                         {"  <body>\n"
                          "    <!-- escape content -->\n"
                          "    a&nbsp;b a &gt; b a &lt; b a &amp; b &apos;a &quot;a&quot;\n"
@@ -2003,6 +2008,7 @@ TEST_CASE("html text extraction")
                          "        <td>data2</td>\n"
                          "      </tr>\n"
                          "    </table>", "heada headb\ndata1 data2\n"},
+                        /* Invalid closing br and hr + comment */
                         {"  <body>\n"
                          "    <!-- page content -->\n"
                          "    Hello, world!<br>test</br><br>content</hr>more content<br>\n"
@@ -2010,6 +2016,16 @@ TEST_CASE("html text extraction")
                          "      content inside div\n"
                          "    </div>\n"
                          "  </body>", "Hello, world!\ntest\ncontentmore content\ncontent inside div\n"},
+                        /* First closing tag */
+                       {"</head>\n"
+                        "<body>\n"
+                        "<p> Hello. I have some bad news.\n"
+                        "<br /> <br /> <br /> <strong> <br /> <br /> <br /> <br /> <br /> <br /> <br /> <br /> </strong><span> <br /> </span>test</p>\n"
+                        "</body>\n"
+                        "</html>", "Hello. I have some bad news. \n\n\n\n\n\n\n\n\n\n\n\ntest\n"},
+                       /* Invalid tags */
+                       {"lol <sht> omg </sht> oh my!\n"
+                        "<name>words words</name> goodbye","lol omg oh my! words words goodbye"},
         };
  
         rspamd_url_init(NULL);
diff --git a/src/libserver/html/html.hxx b/src/libserver/html/html.hxx

index 67ef5a612b0397d5326d74ae07c05723f42862ab..5b5d0ddc06bebbe7e5764eb895c2ff945b7058b4 100644 (file)
--- a/src/libserver/html/html.hxx
+++ b/src/libserver/html/html.hxx
@@ -51,7 +51,7 @@ struct html_content {
  
         /* Preallocate and reserve all internal structures */
         html_content() {
-               tags_seen.resize(N_TAGS, false);
+               tags_seen.resize(Tag_MAX, false);
                 all_tags.reserve(128);
                 parsed.reserve(256);
         }
diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx

index 17c15962a7a59d2e35dd281fca16a37562c6b19a..a221a48ffeadea219e3fbe859a58ddaacda07d9c 100644 (file)
--- a/src/libserver/html/html_tag.hxx
+++ b/src/libserver/html/html_tag.hxx
@@ -85,7 +85,7 @@ struct html_tag {
         unsigned int tag_start = 0;
         unsigned int content_offset = 0;
         std::uint32_t flags = 0;
-       tag_id_t id = N_TAGS;
+       tag_id_t id = Tag_UNKNOWN;
         html_closing_tag closing;
  
         std::vector<html_tag_component> components;
@@ -116,7 +116,7 @@ struct html_tag {
         }
  
         auto clear(void) -> void {
-               id = N_TAGS;
+               id = Tag_UNKNOWN;
                 tag_start = content_offset = 0;
                 extra = std::monostate{};
                 components.clear();
diff --git a/src/libserver/html/html_tags.h b/src/libserver/html/html_tags.h

index 3f209c08ee9d7f6da07541cc1377a6cc2e1fdc08..e94dd6a9a545e1ed56bbd17e0ccdd266edb69348 100644 (file)
--- a/src/libserver/html/html_tags.h
+++ b/src/libserver/html/html_tags.h
@@ -22,7 +22,7 @@ extern "C" {
  
  /* Known HTML tags */
  typedef enum {
-       Tag_UNKNOWN, /**< Unknown tag! */
+       Tag_UNKNOWN = 0, /**< Unknown tag! */
         Tag_A,      /**< A */
         Tag_ABBR,   /**< ABBR */
         Tag_ACRONYM, /**< ACRONYM */
@@ -143,8 +143,9 @@ typedef enum {
         Tag_XMP,    /**< XMP */
         Tag_XML,    /**< XML */
         Tag_NEXTID, /**< NEXTID */
+       Tag_MAX,
  
-       N_TAGS      /**< Must be last */
+       N_TAGS  = -1 /**< Must be -1 */
  } tag_id_t;
  
  #define CM_UNKNOWN      0
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 5 Jul 2021 16:40:59 +0000 (17:40 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 5 Jul 2021 16:40:59 +0000 (17:40 +0100)
src/libserver/html/html.cxx		patch | blob | blame | history
src/libserver/html/html.hxx		patch | blob | blame | history
src/libserver/html/html_tag.hxx		patch | blob | blame | history
src/libserver/html/html_tags.h		patch | blob | blame | history