From: Vsevolod Stakhov Date: Fri, 2 Jul 2021 10:33:52 +0000 (+0100) Subject: [Minor] Html: Fix some more mess with bad closing tags X-Git-Tag: 3.0~221 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=570007aa24ad1125ebc4b6ab98d383cf9b16ca3e;p=thirdparty%2Frspamd.git [Minor] Html: Fix some more mess with bad closing tags --- diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 82c5d213c3..967411b2ae 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -1062,7 +1062,10 @@ html_append_tag_content(rspamd_mempool_t *pool, }; if (tag->id == Tag_BR || tag->id == Tag_HR) { - hc->parsed.append("\n"); + + if (!(tag->flags & FL_IGNORE)) { + hc->parsed.append("\n"); + } return tag->content_offset; } @@ -1331,6 +1334,10 @@ html_process_input(rspamd_mempool_t *pool, if (!(cur_tag->flags & CM_EMPTY)) { html_process_block_tag(pool, cur_tag, hc); } + else { + /* Implicitly close */ + cur_tag->flags |= FL_CLOSED; + } if (cur_tag->flags & FL_CLOSED) { cur_tag->closing.end = cur_tag->content_offset; @@ -1660,6 +1667,11 @@ html_process_input(rspamd_mempool_t *pool, break; case tag_end_closing: { if (cur_tag) { + + if (cur_tag->flags & CM_EMPTY) { + /* Ignore closing empty tags */ + cur_tag->flags |= FL_IGNORE; + } /* cur_tag here is a closing tag */ auto *next_cur_tag = html_check_balance(hc, cur_tag, c - start, p - start + 1); @@ -1687,7 +1699,7 @@ html_process_input(rspamd_mempool_t *pool, auto &&vtag = std::make_unique<html_tag>(); vtag->id = cur_tag->id; - vtag->flags = FL_VIRTUAL | FL_CLOSED; + vtag->flags = FL_VIRTUAL | FL_CLOSED | cur_tag->flags; vtag->tag_start = cur_tag->closing.start; vtag->content_offset = p - start + 1; vtag->closing = cur_tag->closing; @@ -1918,6 +1930,13 @@ TEST_CASE("html text extraction") { const std::vector<std::pair<std::string, std::string>> cases{ + {" \n" + " \n" + " Hello, world!
test

contentmore content
\n" + "
\n" + " content inside div\n" + "
\n" + " ", "Hello, world!\ntest\ncontent\nmore content\ncontent inside div\n"}, {" \n" " \n" " a b a > b a < b a & b 'a "a"\n" @@ -1981,6 +2000,7 @@ TEST_CASE("html text extraction") " data2\n" " \n" " ", "heada headb\ndata1 data2\n"}, + }; rspamd_url_init(NULL);