git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Project] Html: Add rows display type support
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 1 Jul 2021 16:46:31 +0000 (17:46 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 1 Jul 2021 16:46:31 +0000 (17:46 +0100)
src/libserver/css/css_value.cxx
src/libserver/css/css_value.hxx
src/libserver/html/html.cxx

index 6988ea993d32bc370224d7e629900fb45f4b1d95..ec44b86a63085f1eb7bb496e7d1472b6517306eb 100644 (file)
@@ -310,14 +310,14 @@ constexpr const auto display_names_map = frozen::make_unordered_map<frozen::stri
                {"list-item",          css_display_value::DISPLAY_BLOCK},
                {"run-in",             css_display_value::DISPLAY_INLINE},
                {"table",              css_display_value::DISPLAY_BLOCK},
-               {"table-caption",      css_display_value::DISPLAY_BLOCK},
-               {"table-column-group", css_display_value::DISPLAY_BLOCK},
-               {"table-header-group", css_display_value::DISPLAY_BLOCK},
-               {"table-footer-group", css_display_value::DISPLAY_BLOCK},
-               {"table-row-group",    css_display_value::DISPLAY_BLOCK},
-               {"table-cell",         css_display_value::DISPLAY_BLOCK},
-               {"table-column",       css_display_value::DISPLAY_BLOCK},
-               {"table-row",          css_display_value::DISPLAY_BLOCK},
+               {"table-caption",      css_display_value::DISPLAY_TABLE_ROW},
+               {"table-column-group", css_display_value::DISPLAY_TABLE_ROW},
+               {"table-header-group", css_display_value::DISPLAY_TABLE_ROW},
+               {"table-footer-group", css_display_value::DISPLAY_TABLE_ROW},
+               {"table-row-group",    css_display_value::DISPLAY_TABLE_ROW},
+               {"table-cell",         css_display_value::DISPLAY_TABLE_ROW},
+               {"table-column",       css_display_value::DISPLAY_TABLE_ROW},
+               {"table-row",          css_display_value::DISPLAY_TABLE_ROW},
                {"initial",            css_display_value::DISPLAY_INLINE},
 });
 
@@ -364,6 +364,9 @@ auto css_value::debug_str() const -> std::string {
                        case css_display_value::DISPLAY_INLINE:
                                ret += "inline";
                                break;
+                       case css_display_value::DISPLAY_TABLE_ROW:
+                               ret += "table_row";
+                               break;
                        }
                }
                else if constexpr (std::is_integral_v<T>) {
index d3d06a5442851a54bd2a8ee9e5975ed067920184..8dcfa63da5e0685484a05a80e957a95b916caf3f 100644 (file)
@@ -75,6 +75,7 @@ struct css_dimension {
 enum class css_display_value : std::uint8_t {
        DISPLAY_INLINE,
        DISPLAY_BLOCK,
+       DISPLAY_TABLE_ROW,
        DISPLAY_HIDDEN
 };
 
index 08820228617e1b2446e5beab0336d55c0fec91bc..ae73b7413def9e6a18a5c1afa6497f3cb6ed43f8 100644 (file)
@@ -1041,10 +1041,26 @@ html_append_tag_content(rspamd_mempool_t *pool,
                                                GList **exceptions,
                                                khash_t (rspamd_url_hash) *url_set) -> goffset
 {
-       auto is_visible = true, is_block = false;
+       auto is_visible = true, is_block = false, is_spaces = false;
        goffset next_tag_offset = tag->closing.end,
                        initial_dest_offset = hc->parsed.size();
 
+       auto append_margin = [&](char c) -> void {
+               if (is_visible) {
+                       if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') {
+                               if (hc->parsed.back() == ' ') {
+                                       /* We also strip extra spaces at the end */
+                                       hc->parsed.erase(std::find_if(hc->parsed.rbegin(), hc->parsed.rend(),
+                                                       [](auto ch) -> auto {
+                                                               return ch != ' ';
+                                                       }).base(),
+                                                       hc->parsed.end());
+                               }
+                               hc->parsed.push_back(c);
+                       }
+               }
+       };
+
        if (tag->id == Tag_BR || tag->id == Tag_HR) {
                hc->parsed.append("\n");
 
@@ -1064,16 +1080,21 @@ html_append_tag_content(rspamd_mempool_t *pool,
                else if (!tag->block->is_visible()) {
                        is_visible = false;
                }
-               else {
-                       is_block = tag->block->has_display() &&
-                                          tag->block->display == css::css_display_value::DISPLAY_BLOCK;
+               else if (tag->block->has_display()) {
+                       if (tag->block->display == css::css_display_value::DISPLAY_BLOCK) {
+                               is_block = true;
+                       }
+                       else if (tag->block->display == css::css_display_value::DISPLAY_TABLE_ROW) {
+                               is_spaces = true;
+                       }
                }
        }
 
        if (is_block) {
-               if (!hc->parsed.empty() && hc->parsed.back() != '\n') {
-                       hc->parsed.append("\n");
-               }
+               append_margin('\n');
+       }
+       else if (is_spaces) {
+               append_margin(' ');
        }
 
        goffset cur_offset = tag->content_offset;
@@ -1104,11 +1125,11 @@ html_append_tag_content(rspamd_mempool_t *pool,
                                                                         std::size_t(final_part_len)});
                }
        }
-
-       if (is_block && is_visible) {
-               if (!hc->parsed.empty() && hc->parsed.back() != '\n') {
-                       hc->parsed.append("\n");
-               }
+       if (is_block) {
+               append_margin('\n');
+       }
+       else if (is_spaces) {
+               append_margin(' ');
        }
 
        if (is_visible) {
@@ -1707,12 +1728,15 @@ html_process_input(rspamd_mempool_t *pool,
                if (tag->block) {
                        if (!tag->block->has_display()) {
                                /* If we have no display field, we can check it by tag */
-                               if (tag->flags & CM_BLOCK) {
+                               if (tag->flags & (CM_BLOCK|CM_TABLE)) {
                                        tag->block->set_display(css::css_display_value::DISPLAY_BLOCK);
                                }
                                else if (tag->flags & CM_HEAD) {
                                        tag->block->set_display(css::css_display_value::DISPLAY_HIDDEN);
                                }
+                               else if (tag->flags & CM_ROW) {
+                                       tag->block->set_display(css::css_display_value::DISPLAY_TABLE_ROW);
+                               }
                                else {
                                        tag->block->set_display(css::css_display_value::DISPLAY_INLINE);
                                }
@@ -1892,6 +1916,17 @@ TEST_CASE("html text extraction")
 {
 
        const std::vector<std::pair<std::string, std::string>> cases{
+                       /* Tables */
+                       {"<table>\n"
+                        "      <tr>\n"
+                        "        <th>heada</th>\n"
+                        "        <th>headb</th>\n"
+                        "      </tr>\n"
+                        "      <tr>\n"
+                        "        <td>data1</td>\n"
+                        "        <td>data2</td>\n"
+                        "      </tr>\n"
+                        "    </table>", "heada headb\ndata1 data2\n"},
                        /* XML tags */
                        {"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n"
                         " <!DOCTYPE html\n"
@@ -1938,7 +1973,7 @@ TEST_CASE("html text extraction")
                         "    </P>\n"
                         "    <b>stuff</p>?\n"
                         "  </body>\n"
-                        "</html>", "Hello, world! test\ndata<> \nstuff?"},
+                        "</html>", "Hello, world! test\ndata<>\nstuff?"},
                        {"<p><!--comment-->test</br></hr><br>", "test\n"},
 
        };