From: Vsevolod Stakhov Date: Tue, 13 Jul 2021 13:57:07 +0000 (+0100) Subject: [Project] Html: Implement rawtext state machine X-Git-Tag: 3.0~168 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=4a51df3cc2e822a1401137698adc94bfa49d229a;p=thirdparty%2Frspamd.git [Project] Html: Implement rawtext state machine --- diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 5d2479ab4d..bde7c01178 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -1223,7 +1223,8 @@ html_process_input(rspamd_mempool_t *pool, tag_end_closing, html_text_content, xml_tag_end, - content_style, + tag_raw_text, + tag_raw_text_less_than, tags_limit_overflow, } state = parse_start; @@ -1643,44 +1644,24 @@ html_process_input(rspamd_mempool_t *pool, } break; - case content_style: { - - /* - * We just search for the first substring and then pass - * the content to the parser (if needed) - * - * TODO: Handle other stuff, we actually need an FSM here to find - * the ending tag... 
- */ - auto end_style = rspamd_substring_search_caseless(p, end - p, - "</style>", 8); - if (end_style == -1) { - /* Invalid style */ - state = html_text_content; + case tag_raw_text: + if (t == '<') { + c = p; + state = tag_raw_text_less_than; } - else { - - if (allow_css) { - auto ret_maybe = rspamd::css::parse_css(pool, {p, std::size_t(end_style)}, - std::move(hc->css_style)); - - if (!ret_maybe.has_value()) { - auto err_str = fmt::format("cannot parse css (error code: {}): {}", - static_cast<int>(ret_maybe.error().type), - ret_maybe.error().description.value_or("unknown error")); - msg_info_pool ("cannot parse css: %*s", - (int) err_str.size(), err_str.data()); - } - else { - hc->css_style = ret_maybe.value(); - } - } - - p += end_style; + p ++; + break; + case tag_raw_text_less_than: + if (t == '/') { + /* Shift back */ + p = c; state = tag_begin; } + else { + p ++; + state = tag_raw_text; + } break; - } case sgml_content: /* TODO: parse DOCTYPE here */ if (t == '>') { @@ -1719,8 +1700,8 @@ html_process_input(rspamd_mempool_t *pool, state = html_text_content; if (cur_tag) { - if (cur_tag->id == Tag_STYLE) { - state = content_style; + if (cur_tag->id == Tag_STYLE || cur_tag->id == Tag_NOSCRIPT || cur_tag->id == Tag_SCRIPT) { + state = tag_raw_text; } if (html_document_state == html_document_state::doctype) { if (cur_tag->id == Tag_HEAD || (cur_tag->flags & CM_HEAD)) { @@ -1806,6 +1787,25 @@ html_process_input(rspamd_mempool_t *pool, parent_tag = cur_tag->parent; g_assert(cur_tag->parent != &cur_closing_tag); } + + if (cur_tag->id == Tag_STYLE && cur_tag->closing.start > cur_tag->content_offset) { + if (allow_css) { + auto ret_maybe = rspamd::css::parse_css(pool, + {start + cur_tag->content_offset, cur_tag->closing.start - cur_tag->content_offset}, + std::move(hc->css_style)); + + if (!ret_maybe.has_value()) { + auto err_str = fmt::format("cannot parse css (error code: {}): {}", + static_cast<int>(ret_maybe.error().type), + ret_maybe.error().description.value_or("unknown 
error")); + msg_info_pool ("cannot parse css: %*s", + (int) err_str.size(), err_str.data()); + } + else { + hc->css_style = ret_maybe.value(); + } + } + } } /* if cur_tag != nullptr */ state = html_text_content; p++;