From f2f16de4ab5f5c2ad58d67704ff040ed96058823 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 1 Jul 2021 17:46:31 +0100 Subject: [PATCH] [Project] Html: Add rows display type support --- src/libserver/css/css_value.cxx | 19 +++++----- src/libserver/css/css_value.hxx | 1 + src/libserver/html/html.cxx | 63 +++++++++++++++++++++++++-------- 3 files changed, 61 insertions(+), 22 deletions(-) diff --git a/src/libserver/css/css_value.cxx b/src/libserver/css/css_value.cxx index 6988ea993d..ec44b86a63 100644 --- a/src/libserver/css/css_value.cxx +++ b/src/libserver/css/css_value.cxx @@ -310,14 +310,14 @@ constexpr const auto display_names_map = frozen::make_unordered_map std::string { case css_display_value::DISPLAY_INLINE: ret += "inline"; break; + case css_display_value::DISPLAY_TABLE_ROW: + ret += "table_row"; + break; } } else if constexpr (std::is_integral_v) { diff --git a/src/libserver/css/css_value.hxx b/src/libserver/css/css_value.hxx index d3d06a5442..8dcfa63da5 100644 --- a/src/libserver/css/css_value.hxx +++ b/src/libserver/css/css_value.hxx @@ -75,6 +75,7 @@ struct css_dimension { enum class css_display_value : std::uint8_t { DISPLAY_INLINE, DISPLAY_BLOCK, + DISPLAY_TABLE_ROW, DISPLAY_HIDDEN }; diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 0882022861..ae73b7413d 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -1041,10 +1041,26 @@ html_append_tag_content(rspamd_mempool_t *pool, GList **exceptions, khash_t (rspamd_url_hash) *url_set) -> goffset { - auto is_visible = true, is_block = false; + auto is_visible = true, is_block = false, is_spaces = false; goffset next_tag_offset = tag->closing.end, initial_dest_offset = hc->parsed.size(); + auto append_margin = [&](char c) -> void { + if (is_visible) { + if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') { + if (hc->parsed.back() == ' ') { + /* We also strip extra spaces at the end */ + hc->parsed.erase(std::find_if(hc->parsed.rbegin(), hc->parsed.rend(), + [](auto ch) -> auto { + return ch != ' '; + }).base(), + hc->parsed.end()); + } + hc->parsed.push_back(c); + } + } + }; + if (tag->id == Tag_BR || tag->id == Tag_HR) { hc->parsed.append("\n"); @@ -1064,16 +1080,21 @@ html_append_tag_content(rspamd_mempool_t *pool, else if (!tag->block->is_visible()) { is_visible = false; } - else { - is_block = tag->block->has_display() && - tag->block->display == css::css_display_value::DISPLAY_BLOCK; + else if (tag->block->has_display()) { + if (tag->block->display == css::css_display_value::DISPLAY_BLOCK) { + is_block = true; + } + else if (tag->block->display == css::css_display_value::DISPLAY_TABLE_ROW) { + is_spaces = true; + } } } if (is_block) { - if (!hc->parsed.empty() && hc->parsed.back() != '\n') { - hc->parsed.append("\n"); - } + append_margin('\n'); + } + else if (is_spaces) { + append_margin(' '); } goffset cur_offset = tag->content_offset; @@ -1104,11 +1125,11 @@ html_append_tag_content(rspamd_mempool_t *pool, std::size_t(final_part_len)}); } } - - if (is_block && is_visible) { - if (!hc->parsed.empty() && hc->parsed.back() != '\n') { - hc->parsed.append("\n"); - } + if (is_block) { + append_margin('\n'); + } + else if (is_spaces) { + append_margin(' '); } if (is_visible) { @@ -1707,12 +1728,15 @@ html_process_input(rspamd_mempool_t *pool, if (tag->block) { if (!tag->block->has_display()) { /* If we have no display field, we can check it by tag */ - if (tag->flags & CM_BLOCK) { + if (tag->flags & (CM_BLOCK|CM_TABLE)) { tag->block->set_display(css::css_display_value::DISPLAY_BLOCK); } else if (tag->flags & CM_HEAD) { tag->block->set_display(css::css_display_value::DISPLAY_HIDDEN); } + else if (tag->flags & CM_ROW) { + tag->block->set_display(css::css_display_value::DISPLAY_TABLE_ROW); + } else { tag->block->set_display(css::css_display_value::DISPLAY_INLINE); } @@ -1892,6 +1916,17 @@ TEST_CASE("html text extraction") { const std::vector> cases{ + /* Tables */ + {"\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + "
headaheadb
data1data2
", "heada headb\ndata1 data2\n"}, /* XML tags */ {"\n" " \n" " stuff

?\n" " \n" - "", "Hello, world! test\ndata<> \nstuff?"}, + "", "Hello, world! test\ndata<>\nstuff?"}, {"

test

", "test\n"}, }; -- 2.47.3