git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Project] Html: Try to deal with bad unknown tags properly
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 5 Jul 2021 16:40:59 +0000 (17:40 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 5 Jul 2021 16:40:59 +0000 (17:40 +0100)
src/libserver/html/html.cxx
src/libserver/html/html.hxx
src/libserver/html/html_tag.hxx
src/libserver/html/html_tags.h

index d46d291d610892aad3e74bfe042b3b2b92e4205f..b29a7d37dbda79735b2d471d87d6fbcf0f4f1269 100644 (file)
@@ -354,7 +354,10 @@ html_parse_tag_content(rspamd_mempool_t *pool,
 
                                if (tag_def == nullptr) {
                                        hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS;
-                                       tag->id = N_TAGS;
+                                       /* Assign -hash to match closing tag if needed */
+                                       auto nhash = static_cast<std::int32_t>(std::hash<std::string_view>{}({s, nsize}));
+                                       /* Always negative */
+                                       tag->id = static_cast<tag_id_t>(nhash | G_MININT32);
                                }
                                else {
                                        tag->id = tag_def->id;
@@ -1040,7 +1043,7 @@ static auto
 html_append_tag_content(rspamd_mempool_t *pool,
                                                const gchar *start, gsize len,
                                                struct html_content *hc,
-                                               const html_tag *tag,
+                                               html_tag *tag,
                                                GList **exceptions,
                                                khash_t (rspamd_url_hash) *url_set) -> goffset
 {
@@ -1048,6 +1051,20 @@ html_append_tag_content(rspamd_mempool_t *pool,
        goffset next_tag_offset = tag->closing.end,
                        initial_dest_offset = hc->parsed.size();
 
+       if (tag->closing.end == -1) {
+               if (tag->closing.start != -1) {
+                       next_tag_offset = tag->closing.start;
+                       tag->closing.end = tag->closing.start;
+               }
+               else {
+                       next_tag_offset = len;
+                       tag->closing.end = len;
+               }
+       }
+       if (tag->closing.start == -1) {
+               tag->closing.start = tag->closing.end;
+       }
+
        auto append_margin = [&](char c) -> void {
                if (is_visible) {
                        if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') {
@@ -1072,7 +1089,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
 
                return tag->content_offset;
        }
-       else if (tag->id == Tag_HEAD || tag->id >= N_TAGS) {
+       else if (tag->id == Tag_HEAD) {
                return tag->closing.end;
        }
 
@@ -1235,7 +1252,7 @@ html_process_input(rspamd_mempool_t *pool,
        };
 
        auto process_opening_tag = [&]() {
-               if (cur_tag->id < N_TAGS) {
+               if (cur_tag->id > Tag_UNKNOWN) {
                        if (cur_tag->flags & CM_UNIQUE) {
                                if (!hc->tags_seen[cur_tag->id]) {
                                        /* Duplicate tag has been found */
@@ -1796,12 +1813,6 @@ html_process_input(rspamd_mempool_t *pool,
 
        /* Leftover after content */
        switch (state) {
-       case html_text_content:
-       case content_before_start:
-               if (p > c) {
-                       html_append_content(hc, {c, std::size_t(p - c)});
-               }
-               break;
        case tag_end_opening:
                if (cur_tag != nullptr) {
                        process_opening_tag();
@@ -1934,12 +1945,6 @@ TEST_CASE("html text extraction")
 {
 
        const std::vector<std::pair<std::string, std::string>> cases{
-                       {"</head>\n"
-                        "<body>\n"
-                        "<p> Hello. I have some bad news.\n"
-                        "<br /> <br /> <br /> <strong> <br /> <br /> <br /> <br /> <br /> <br /> <br /> <br /> </strong><span> <br /> </span>test</p>\n"
-                        "</body>\n"
-                        "</html>", "Hello. I have some bad news. \n\n\n\n\n\n\n\n\n\n\n\ntest\n"},
                        {"  <body>\n"
                         "    <!-- escape content -->\n"
                         "    a&nbsp;b a &gt; b a &lt; b a &amp; b &apos;a &quot;a&quot;\n"
@@ -2003,6 +2008,7 @@ TEST_CASE("html text extraction")
                         "        <td>data2</td>\n"
                         "      </tr>\n"
                         "    </table>", "heada headb\ndata1 data2\n"},
+                        /* Invalid closing br and hr + comment */
                        {"  <body>\n"
                         "    <!-- page content -->\n"
                         "    Hello, world!<br>test</br><br>content</hr>more content<br>\n"
@@ -2010,6 +2016,16 @@ TEST_CASE("html text extraction")
                         "      content inside div\n"
                         "    </div>\n"
                         "  </body>", "Hello, world!\ntest\ncontentmore content\ncontent inside div\n"},
+                        /* First closing tag */
+                       {"</head>\n"
+                        "<body>\n"
+                        "<p> Hello. I have some bad news.\n"
+                        "<br /> <br /> <br /> <strong> <br /> <br /> <br /> <br /> <br /> <br /> <br /> <br /> </strong><span> <br /> </span>test</p>\n"
+                        "</body>\n"
+                        "</html>", "Hello. I have some bad news. \n\n\n\n\n\n\n\n\n\n\n\ntest\n"},
+                       /* Invalid tags */
+                       {"lol <sht> omg </sht> oh my!\n"
+                        "<name>words words</name> goodbye","lol omg oh my! words words goodbye"},
        };
 
        rspamd_url_init(NULL);
index 67ef5a612b0397d5326d74ae07c05723f42862ab..5b5d0ddc06bebbe7e5764eb895c2ff945b7058b4 100644 (file)
@@ -51,7 +51,7 @@ struct html_content {
 
        /* Preallocate and reserve all internal structures */
        html_content() {
-               tags_seen.resize(N_TAGS, false);
+               tags_seen.resize(Tag_MAX, false);
                all_tags.reserve(128);
                parsed.reserve(256);
        }
index 17c15962a7a59d2e35dd281fca16a37562c6b19a..a221a48ffeadea219e3fbe859a58ddaacda07d9c 100644 (file)
@@ -85,7 +85,7 @@ struct html_tag {
        unsigned int tag_start = 0;
        unsigned int content_offset = 0;
        std::uint32_t flags = 0;
-       tag_id_t id = N_TAGS;
+       tag_id_t id = Tag_UNKNOWN;
        html_closing_tag closing;
 
        std::vector<html_tag_component> components;
@@ -116,7 +116,7 @@ struct html_tag {
        }
 
        auto clear(void) -> void {
-               id = N_TAGS;
+               id = Tag_UNKNOWN;
                tag_start = content_offset = 0;
                extra = std::monostate{};
                components.clear();
index 3f209c08ee9d7f6da07541cc1377a6cc2e1fdc08..e94dd6a9a545e1ed56bbd17e0ccdd266edb69348 100644 (file)
@@ -22,7 +22,7 @@ extern "C" {
 
 /* Known HTML tags */
 typedef enum {
-       Tag_UNKNOWN, /**< Unknown tag! */
+       Tag_UNKNOWN = 0, /**< Unknown tag! */
        Tag_A,      /**< A */
        Tag_ABBR,   /**< ABBR */
        Tag_ACRONYM, /**< ACRONYM */
@@ -143,8 +143,9 @@ typedef enum {
        Tag_XMP,    /**< XMP */
        Tag_XML,    /**< XML */
        Tag_NEXTID, /**< NEXTID */
+       Tag_MAX,
 
-       N_TAGS      /**< Must be last */
+       N_TAGS  = -1 /**< Must be -1 */
 } tag_id_t;
 
 #define CM_UNKNOWN      0