git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Rework] Save invisible content to a separate buffer
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 7 Sep 2021 14:00:38 +0000 (15:00 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 7 Sep 2021 14:01:30 +0000 (15:01 +0100)
src/libserver/html/html.cxx
src/libserver/html/html.hxx
src/libserver/html/html_tag.hxx
src/lua/lua_html.cxx

index e4cc137b43ddc2da9f4badb505e99d066cf20f8b..97009749f3691c64af6c1ef9c4632ca76ca47c56 100644 (file)
@@ -985,12 +985,15 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
 }
 
 static inline auto
-html_append_parsed(struct html_content *hc, std::string_view data, bool transparent,
-               std::size_t input_len) -> std::size_t
+html_append_parsed(struct html_content *hc,
+                                  std::string_view data,
+                                  bool transparent,
+                                  std::size_t input_len,
+                                  std::string &dest) -> std::size_t
 {
-       auto cur_offset = hc->parsed.size();
+       auto cur_offset = dest.size();
 
-       if (hc->parsed.size() > input_len) {
+       if (dest.size() > input_len) {
                /* Impossible case, refuse to append */
                return 0;
        }
@@ -999,9 +1002,9 @@ html_append_parsed(struct html_content *hc, std::string_view data, bool transpar
                /* Handle multiple spaces at the begin */
 
                if (cur_offset > 0) {
-                       auto last = hc->parsed.back();
+                       auto last = dest.back();
                        if (!g_ascii_isspace(last) && g_ascii_isspace(data.front())) {
-                               hc->parsed.append(" ");
+                               dest.append(" ");
                                data = {data.data() + 1, data.size() - 1};
                                cur_offset++;
                        }
@@ -1020,24 +1023,24 @@ html_append_parsed(struct html_content *hc, std::string_view data, bool transpar
                                }
                        };
 
-                       hc->parsed.reserve(hc->parsed.size() + data.size() + sizeof(u8"\uFFFD"));
-                       replace_zero_func(data, hc->parsed);
+                       dest.reserve(dest.size() + data.size() + sizeof(u8"\uFFFD"));
+                       replace_zero_func(data, dest);
                        hc->flags |= RSPAMD_HTML_FLAG_HAS_ZEROS;
                }
                else {
-                       hc->parsed.append(data);
+                       dest.append(data);
                }
        }
 
-       auto nlen = decode_html_entitles_inplace(hc->parsed.data() + cur_offset,
-                       hc->parsed.size() - cur_offset, true);
+       auto nlen = decode_html_entitles_inplace(dest.data() + cur_offset,
+                       dest.size() - cur_offset, true);
 
-       hc->parsed.resize(nlen + cur_offset);
+       dest.resize(nlen + cur_offset);
 
        if (transparent) {
                /* Replace all visible characters with spaces */
-               auto start = std::next(hc->parsed.begin(), cur_offset);
-               std::replace_if(start, std::end(hc->parsed), [](const auto c) {
+               auto start = std::next(dest.begin(), cur_offset);
+               std::replace_if(start, std::end(dest), [](const auto c) {
                        return !g_ascii_isspace(c);
                }, ' ');
        }
@@ -1076,11 +1079,18 @@ html_append_tag_content(rspamd_mempool_t *pool,
 {
        auto is_visible = true, is_block = false, is_spaces = false, is_transparent = false;
        goffset next_tag_offset = tag->closing.end,
-                       initial_dest_offset = hc->parsed.size();
+                       initial_parsed_offset = hc->parsed.size(),
+                       initial_invisible_offset = hc->invisible.size();
 
-       auto calculate_final_tag_offsets = [&tag, initial_dest_offset, hc]() -> void {
-               tag->content_offset = initial_dest_offset;
-               tag->closing.start = hc->parsed.size();
+       auto calculate_final_tag_offsets = [&]() -> void {
+               if (is_visible) {
+                       tag->content_offset = initial_parsed_offset;
+                       tag->closing.start = hc->parsed.size();
+               }
+               else {
+                       tag->content_offset = initial_invisible_offset;
+                       tag->closing.start = hc->invisible.size();
+               }
        };
 
        if (tag->closing.end == -1) {
@@ -1098,17 +1108,18 @@ html_append_tag_content(rspamd_mempool_t *pool,
        }
 
        auto append_margin = [&](char c) -> void {
+               /* We do care about visible margins only */
                if (is_visible) {
                        if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') {
                                if (hc->parsed.back() == ' ') {
                                        /* We also strip extra spaces at the end, but limiting the start */
-                                       auto last = std::make_reverse_iterator(hc->parsed.begin() + initial_dest_offset);
+                                       auto last = std::make_reverse_iterator(hc->parsed.begin() + initial_parsed_offset);
                                        auto first = std::find_if(hc->parsed.rbegin(), last,
                                                        [](auto ch) -> auto {
                                                                return ch != ' ';
                                                        });
                                        hc->parsed.erase(first.base(), hc->parsed.end());
-                                       g_assert(hc->parsed.size() >= initial_dest_offset);
+                                       g_assert(hc->parsed.size() >= initial_parsed_offset);
                                }
                                hc->parsed.push_back(c);
                        }
@@ -1177,10 +1188,17 @@ html_append_tag_content(rspamd_mempool_t *pool,
                auto enclosed_start = cld->tag_start;
                goffset initial_part_len = enclosed_start - cur_offset;
 
-               if (is_visible && initial_part_len > 0) {
-                       html_append_parsed(hc,
-                                       {start + cur_offset, std::size_t(initial_part_len)},
-                                       is_transparent, len);
+               if (initial_part_len > 0) {
+                       if (is_visible) {
+                               html_append_parsed(hc,
+                                               {start + cur_offset, std::size_t(initial_part_len)},
+                                               is_transparent, len, hc->parsed);
+                       }
+                       else {
+                               html_append_parsed(hc,
+                                               {start + cur_offset, std::size_t(initial_part_len)},
+                                               is_transparent, len, hc->invisible);
+                       }
                }
 
                auto next_offset = html_append_tag_content(pool, start, len,
@@ -1195,11 +1213,21 @@ html_append_tag_content(rspamd_mempool_t *pool,
        if (cur_offset < tag->closing.start) {
                goffset final_part_len = tag->closing.start - cur_offset;
 
-               if (is_visible && final_part_len > 0) {
-                       html_append_parsed(hc,
-                                       {start + cur_offset, std::size_t(final_part_len)},
-                                        is_transparent,
-                                        len);
+               if (final_part_len > 0) {
+                       if (is_visible) {
+                               html_append_parsed(hc,
+                                               {start + cur_offset, std::size_t(final_part_len)},
+                                               is_transparent,
+                                               len,
+                                               hc->parsed);
+                       }
+                       else {
+                               html_append_parsed(hc,
+                                               {start + cur_offset, std::size_t(final_part_len)},
+                                               is_transparent,
+                                               len,
+                                               hc->invisible);
+                       }
                }
        }
        if (is_block) {
@@ -1211,11 +1239,11 @@ html_append_tag_content(rspamd_mempool_t *pool,
 
        if (is_visible) {
                if (tag->id == Tag_A) {
-                       auto written_len = hc->parsed.size() - initial_dest_offset;
+                       auto written_len = hc->parsed.size() - initial_parsed_offset;
                        html_process_displayed_href_tag(pool, hc,
-                                       {hc->parsed.data() + initial_dest_offset, written_len},
+                                       {hc->parsed.data() + initial_parsed_offset, written_len},
                                        tag, exceptions,
-                                       url_set, initial_dest_offset);
+                                       url_set, initial_parsed_offset);
                }
                else if (tag->id == Tag_IMG) {
                        /* Process ALT if presented */
@@ -1997,7 +2025,7 @@ html_process_input(rspamd_mempool_t *pool,
                break;
        case tags_limit_overflow:
                html_append_parsed(hc, {c, (std::size_t) (end - c)},
-                               false, end - start);
+                               false, end - start, hc->parsed);
                break;
        default:
                /* Do nothing */
@@ -2084,6 +2112,27 @@ auto html_tag_by_name(const std::string_view &name)
        return std::nullopt;
 }
 
+auto
+html_tag::get_content(const struct html_content *hc) const -> std::string_view
+{ /* Returns a view of at most get_content_length() bytes starting at content_offset in one of hc's text buffers */
+       const std::string *dest = &hc->parsed; /* default source buffer: hc->parsed */
+       /* A tag that has a block which reports !is_visible() is read from hc->invisible instead */
+       if (block && !block->is_visible()) {
+               dest = &hc->invisible; /* NOTE(review): html_append_tag_content picks the buffer from its local is_visible flag - confirm it always agrees with this check */
+       }
+       const auto clen = get_content_length();
+       if (content_offset < dest->size()) { /* only slice when the offset lies inside the chosen buffer */
+               if (dest->size() - content_offset >= clen) { /* the whole content fits before the end of the buffer */
+                       return std::string_view{*dest}.substr(content_offset, clen);
+               }
+               else { /* fewer than clen bytes remain: clamp the view to the end of the buffer */
+                       return std::string_view{*dest}.substr(content_offset, dest->size() - content_offset);
+               }
+       }
+       /* Offset is at or past the end of the chosen buffer: return an empty view */
+       return std::string_view{};
+}
+
 }
 
 void *
index 7e63bedce633a3ab2bc55ab5abfa220062fe8697..5c16d085ac0e0fa167021eade02433406c2692b0 100644 (file)
@@ -47,6 +47,7 @@ struct html_content {
        std::vector<html_image *> images;
        std::vector<std::unique_ptr<struct html_tag>> all_tags;
        std::string parsed;
+       std::string invisible;
        std::shared_ptr<css::css_style_sheet> css_style;
 
        /* Preallocate and reserve all internal structures */
index b6fc73120dba1c29b0dd3296fe7cfb362026b8fb..5971ca179b0007f0b6789c5b083307405f49b7ac 100644 (file)
@@ -31,6 +31,8 @@ struct html_image;
 
 namespace rspamd::html {
 
+struct html_content; /* Forward declaration */
+
 enum class html_component_type : std::uint8_t {
        RSPAMD_HTML_COMPONENT_NAME = 0,
        RSPAMD_HTML_COMPONENT_HREF,
@@ -141,19 +143,7 @@ struct html_tag {
                return 0;
        }
 
-       constexpr auto get_content(std::string_view parsed) const -> std::string_view {
-               const auto clen = get_content_length();
-               if (content_offset < parsed.size()) {
-                       if (parsed.size() - content_offset >= clen) {
-                               return parsed.substr(content_offset, clen);
-                       }
-                       else {
-                               return parsed.substr(content_offset, parsed.size() - content_offset);
-                       }
-               }
-
-               return std::string_view{};
-       }
+       auto get_content(const struct html_content *hc) const -> std::string_view;
 };
 
 static_assert(CM_USER_SHIFT + 7 < sizeof(html_tag::flags) * NBBY);
index 848bfbdf3819506cd95d8c82ee7340efed3fd15c..8767d1a117f7d12d3a61f6852380302081d7af71 100644 (file)
@@ -448,7 +448,7 @@ lua_html_foreach_tag (lua_State *L)
                                auto *ltag = static_cast<lua_html_tag *>(lua_newuserdata(L, sizeof(lua_html_tag)));
                                ltag->tag = tag;
                                ltag->html = hc;
-                               auto ct = ltag->tag->get_content(hc->parsed);
+                               auto ct = ltag->tag->get_content(hc);
                                rspamd_lua_setclass (L, "rspamd{html_tag}", -1);
                                lua_pushinteger (L, ct.size());
 
@@ -582,7 +582,7 @@ lua_html_tag_get_content (lua_State *L)
        if (ltag) {
 
                if (ltag->html) {
-                       auto ct = ltag->tag->get_content(ltag->html->parsed);
+                       auto ct = ltag->tag->get_content(ltag->html);
                        if (ct.size() > 0) {
                                t = static_cast<rspamd_lua_text *>(lua_newuserdata(L, sizeof(*t)));
                                rspamd_lua_setclass(L, "rspamd{text}", -1);
@@ -613,7 +613,7 @@ lua_html_tag_get_content_length (lua_State *L)
 
        if (ltag) {
                if (ltag->html) {
-                       auto ct = ltag->tag->get_content(ltag->html->parsed);
+                       auto ct = ltag->tag->get_content(ltag->html);
                        lua_pushinteger (L, ct.size());
                }
                else {