]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Project] Css: Implement numbers and ident parsers
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 26 Jan 2021 14:44:01 +0000 (14:44 +0000)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 26 Jan 2021 14:44:01 +0000 (14:44 +0000)
src/libserver/css/css_tokeniser.cxx
src/libserver/css/css_tokeniser.hxx

index 058f7504e76bbe98bec05ddcfb46b005de62761c..f545af47a765ddf97b5172544556a2398931cd3b 100644 (file)
@@ -16,6 +16,9 @@
 
 #include "css_tokeniser.hxx"
 #include "css_util.hxx"
+#include "css.hxx"
+#include <charconv>
+#include <cmath>
+#include <string>
 
 namespace rspamd::css {
 
@@ -35,6 +38,13 @@ auto make_token<css_parser_token::token_type::string_token, std::string_view>(co
        return css_parser_token{css_parser_token::token_type::string_token, s};
 }
 
+/* Ident tokens carry the identifier text as a string view payload */
+template<>
+auto make_token<css_parser_token::token_type::ident_token, std::string_view>(const std::string_view &s)
+-> css_parser_token
+{
+       constexpr auto kind = css_parser_token::token_type::ident_token;
+
+       return css_parser_token(kind, s);
+}
+
 template<>
 auto make_token<css_parser_token::token_type::whitespace_token, std::string_view>(const std::string_view &s)
         -> css_parser_token
@@ -49,6 +59,13 @@ auto make_token<css_parser_token::token_type::delim_token, char>(const char &c)
        return css_parser_token{css_parser_token::token_type::delim_token, c};
 }
 
+/* Number tokens carry their numeric value as a double payload */
+template<>
+auto make_token<css_parser_token::token_type::number_token, double>(const double &d)
+-> css_parser_token
+{
+       constexpr auto kind = css_parser_token::token_type::number_token;
+
+       return css_parser_token(kind, d);
+}
+
 /*
  * Generic tokens with no value (non-terminals)
  */
@@ -58,6 +75,287 @@ auto make_token(void) -> css_parser_token
        return css_parser_token{T, css_parser_token_placeholder()};
 }
 
+static constexpr inline auto is_plain_ident(char c) -> bool
+{
+       if ((c & 0x80) || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') {
+               return true;
+       }
+
+       return false;
+};
+
+auto
+css_parser_token::adjust_dim(const css_parser_token &dim_token) -> bool
+{
+       if (!std::holds_alternative<double>(value) ||
+               !std::holds_alternative<std::string_view>(dim_token.value)) {
+               /* Invalid tokens */
+               return false;
+       }
+
+       auto num = std::get<double>(value);
+       auto sv = std::get<std::string_view>(dim_token.value);
+
+       if (sv == "px") {
+               dim_type = css_parser_token::dim_type::dim_px;
+               flags |= css_parser_token::number_dimension;
+               num = (unsigned)num; /* Round to number */
+       }
+       else if (sv == "em") {
+               dim_type = css_parser_token::dim_type::dim_em;
+               flags |= css_parser_token::number_dimension;
+               /* EM is 16 px, so multiply and round */
+               num = (unsigned)(num * 16.0);
+       }
+       else if (sv == "rem") {
+               /* equal to EM in our case */
+               dim_type = css_parser_token::dim_type::dim_rem;
+               flags |= css_parser_token::number_dimension;
+               num = (unsigned)(num * 16.0);
+       }
+       else if (sv == "ex") {
+               /*
+                * Represents the x-height of the element's font.
+                * On fonts with the "x" letter, this is generally the height
+                * of lowercase letters in the font; 1ex = 0.5em in many fonts.
+                */
+               dim_type = css_parser_token::dim_type::dim_ex;
+               flags |= css_parser_token::number_dimension;
+               num = (unsigned)(num * 8.0);
+       }
+       else if (sv == "wv") {
+               /*
+                * Vewport width in percentages:
+                * we assume 1% of viewport width as 8px
+                */
+               dim_type = css_parser_token::dim_type::dim_wv;
+               flags |= css_parser_token::number_dimension;
+               num = (unsigned)(num * 8.0);
+       }
+       else if (sv == "wh") {
+               /*
+                * Vewport height in percentages
+                * we assume 1% of viewport width as 6px
+                */
+               dim_type = css_parser_token::dim_type::dim_wh;
+               flags |= css_parser_token::number_dimension;
+               num = (unsigned)(num * 6.0);
+       }
+       else if (sv == "vmax") {
+               /*
+                * Vewport width in percentages
+                * we assume 1% of viewport width as 6px
+                */
+               dim_type = css_parser_token::dim_type::dim_vmax;
+               flags |= css_parser_token::number_dimension;
+               num = (unsigned)(num * 8.0);
+       }
+       else if (sv == "vmin") {
+               /*
+                * Vewport height in percentages
+                * we assume 1% of viewport width as 6px
+                */
+               dim_type = css_parser_token::dim_type::dim_vmin;
+               flags |= css_parser_token::number_dimension;
+               num = (unsigned)(num * 6.0);
+       }
+       else if (sv == "pt") {
+               dim_type = css_parser_token::dim_type::dim_pt;
+               flags |= css_parser_token::number_dimension;
+               num = (num * 96.0 / 72.0); /* One point. 1pt = 1/72nd of 1in */
+       }
+       else if (sv == "cm") {
+               dim_type = css_parser_token::dim_type::dim_cm;
+               flags |= css_parser_token::number_dimension;
+               num = (num * 96.0 / 2.54); /* 96px/2.54 */
+       }
+       else if (sv == "mm") {
+               dim_type = css_parser_token::dim_type::dim_mm;
+               flags |= css_parser_token::number_dimension;
+               num = (num * 9.6 / 2.54); /* 9.6px/2.54 */
+       }
+       else if (sv == "in") {
+               dim_type = css_parser_token::dim_type::dim_in;
+               flags |= css_parser_token::number_dimension;
+               num = (num * 96.0); /* 96px */
+       }
+       else if (sv == "pc") {
+               dim_type = css_parser_token::dim_type::dim_pc;
+               flags |= css_parser_token::number_dimension;
+               num = (num * 96.0 / 6.0); /* 1pc = 12pt = 1/6th of 1in. */
+       }
+       else {
+               flags |= css_parser_token::flag_bad_dimension;
+
+               return false;
+       }
+
+       value = num;
+
+       return true;
+}
+
+
+/*
+ * Consume functions: return a token and advance lexer offset
+ */
+auto css_tokeniser::consume_ident() -> struct css_parser_token
+{
+       auto i = offset;
+       auto need_escape = false;
+
+       /* Ident token can start from `-` or `--` */
+       if (input[i] == '-') {
+               i ++;
+
+               if (i < input.size() && input[i] == '-') {
+                       i ++;
+               }
+       }
+
+       while (i < input.size()) {
+               auto c = input[i];
+
+               if (!is_plain_ident(c)) {
+                       if (c == '\\' && i + 1 < input.size ()) {
+                               need_escape = true;
+                               auto nhex = 0;
+
+                               /* Need to find an escape end */
+                               do {
+                                       c = input[++i];
+                                       if (g_ascii_isxdigit(c)) {
+                                               nhex++;
+
+                                               if (nhex > 6) {
+                                                       /* End of the escape */
+                                                       break;
+                                               }
+                                       }
+                                       else if (nhex > 0 && c == ' ') {
+                                               /* \[hex]{1,6} */
+                                               i++; /* Skip one space */
+                                               break;
+                                       }
+                                       else {
+                                               /* Single \ + char */
+                                               break;
+                                       }
+                               } while (i < input.size ());
+                       }
+                       else {
+                               i --; /* Push token back */
+                               break; /* Not an ident token */
+                       }
+               } /* !plain ident */
+
+               i ++;
+       }
+
+       if (need_escape) {
+               auto escaped = rspamd::css::unescape_css(pool, {&input[offset], i - offset});
+               offset = i;
+
+               return make_token<css_parser_token::token_type::ident_token>(escaped);
+       }
+
+       auto result = std::string_view{&input[offset], i - offset};
+       offset = i;
+
+       return make_token<css_parser_token::token_type::ident_token>(result);
+}
+
+auto css_tokeniser::consume_number() -> struct css_parser_token
+{
+       auto i = offset;
+       auto seen_dot = false, seen_exp = false;
+
+       if (input[i] == '-') {
+               i ++;
+       }
+       if (input[i] == '.' && i < input.size()) {
+               seen_dot = true;
+               i ++;
+       }
+
+       while (i < input.size()) {
+               auto c = input[i];
+
+               if (!g_ascii_isdigit(c)) {
+                       if (c == '.') {
+                               if (!seen_dot) {
+                                       seen_dot = true;
+                               }
+                               else {
+                                       i --; /* Push back */
+                                       break;
+                               }
+                       }
+                       else if (c == 'e' || c == 'E') {
+                               if (!seen_exp) {
+                                       seen_exp = true;
+                                       seen_dot = true; /* dots are not allowed after e */
+
+                                       if (i + 1 < input.size()) {
+                                               auto next_c = input[i + 1];
+                                               if (next_c == '+' || next_c == '-') {
+                                                       i ++;
+                                               }
+                                       }
+                               }
+                               else {
+                                       i --; /* Push back */
+                                       break;
+                               }
+                       }
+               }
+
+               i ++;
+       }
+
+       if (i > offset) {
+               double num;
+
+               /* I wish it was supported properly */
+               //auto conv_res = std::from_chars(&input[offset], &input[i], num);
+               std::string numbuf{&input[offset], (i - offset)};
+               num = std::stod(numbuf);
+
+               auto ret = make_token<css_parser_token::token_type::number_token>(num);
+
+               if (i < input.size()) {
+                       if (input[i] == '%') {
+                               ret.flags |= css_parser_token::number_percent;
+                               i ++;
+                       }
+                       else if (is_plain_ident(input[i])) {
+                               auto dim_token = consume_ident();
+
+                               if (dim_token.type == css_parser_token::token_type::ident_token) {
+                                       if (!dim_token.adjust_dim(dim_token)) {
+                                               auto sv = std::get<std::string_view>(dim_token.value);
+                                               msg_debug_css("cannot apply dimension from the token %*s; number value = %.1f",
+                                                               (int)sv.size(), sv.begin(), num);
+                                       }
+                               }
+                       }
+               }
+
+               return ret;
+       }
+       else {
+               msg_err_css("internal error: invalid number, empty token");
+               i ++;
+       }
+
+       offset = i;
+       /* Should not happen */
+       return make_token<css_parser_token::token_type::delim_token>(input[i - 1]);
+}
+
+/*
+ * Main routine to produce lexer tokens
+ */
 auto css_tokeniser::next_token(void) -> struct css_parser_token
 {
        /* Helpers */
@@ -72,7 +370,7 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token
                auto nested = 0;
 
                /* We handle nested comments just because they can exist... */
-               while (i < input.size () - 1) {
+               while (i < input.size() - 1) {
                        auto c = input[i];
                        if (c == '*' && input[i + 1] == '/') {
                                if (nested == 0) {
@@ -106,16 +404,15 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token
                auto i = offset;
                bool need_unescape = false;
 
-               while (i < input.size ()) {
+               while (i < input.size()) {
                        auto c = input[i];
 
                        if (c == '\\') {
-                               if (i + 1 < input.size ()) {
+                               if (i + 1 < input.size()) {
                                        need_unescape = true;
                                }
                                else {
                                        /* \ at the end -> ignore */
-
                                }
                        }
                        else if (c == quote_char) {
@@ -133,6 +430,8 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token
                        else if (c == '\n') {
                                /* Should be a error, but we ignore it for now */
                        }
+
+                       i ++;
                }
 
                /* EOF with no quote character, consider it fine */
@@ -148,15 +447,15 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token
        };
 
        /* Main tokenisation loop */
-       for (auto i = offset; i < input.size (); ++i) {
+       for (auto i = offset; i < input.size(); ++i) {
                auto c = input[i];
 
                switch (c) {
                case '/':
-                       if (i + 1 < input.size () && input[i + 1] == '*') {
+                       if (i + 1 < input.size() && input[i + 1] == '*') {
                                offset = i + 2;
-                               consume_comment (); /* Consume comment and go forward */
-                               return next_token (); /* Tail call */
+                               consume_comment(); /* Consume comment and go forward */
+                               return next_token(); /* Tail call */
                        }
                        else {
                                offset = i + 1;
@@ -171,7 +470,7 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token
                        /* Consume as much space as we can */
                        do {
                                c = input[++i];
-                       } while (i < input.size () && g_ascii_isspace (c));
+                       } while (i < input.size() && g_ascii_isspace(c));
 
                        auto ret = make_token<css_parser_token::token_type::whitespace_token>(
                                        std::string_view(&input[offset], i - offset));
@@ -188,8 +487,22 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token
                case ')':
                        offset = i + 1;
                        return make_token<css_parser_token::token_type::ebrace_token>();
+               case '[':
+                       offset = i + 1;
+                       return make_token<css_parser_token::token_type::osqbrace_token>();
+               case ']':
+                       offset = i + 1;
+                       return make_token<css_parser_token::token_type::esqbrace_token>();
+               case '{':
+                       offset = i + 1;
+                       return make_token<css_parser_token::token_type::ocurlbrace_token>();
+               case '}':
+                       offset = i + 1;
+                       return make_token<css_parser_token::token_type::ecurlbrace_token>();
                case ',':
                        return make_token<css_parser_token::token_type::comma_token>();
+               case ';':
+                       return make_token<css_parser_token::token_type::semicolon_token>();
                case '<':
                        /* Maybe an xml like comment */
                        if (i + 3 < input.size () && input[i + 1] == '!'
@@ -202,6 +515,42 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token
                                offset = i + 1;
                                return make_token<css_parser_token::token_type::delim_token>(c);
                        }
+                       break;
+               case '-':
+                       if (i + 1 < input.size()) {
+                               auto next_c = input[i + 1];
+
+                               if (g_ascii_isdigit(next_c)) {
+                                       /* negative number */
+                                       return consume_number();
+                               }
+                               else if (next_c == '-') {
+                                       if (i + 2 < input.size() && input[i + 2] == '>') {
+                                               /* XML like comment */
+                                               return make_token<css_parser_token::token_type::cdc_token>();
+                                       }
+                               }
+                       }
+                       /* No other options, a delimiter - */
+                       offset = i + 1;
+                       return make_token<css_parser_token::token_type::delim_token>(c);
+
+                       break;
+               case '+':
+               case '.':
+                       /* Maybe number */
+                       if (i + 1 < input.size()) {
+                               auto next_c = input[i + 1];
+
+                               if (g_ascii_isdigit(next_c)) {
+                                       /* Numeric token */
+                                       return consume_number();
+                               }
+                       }
+                       /* No other options, a delimiter - */
+                       offset = i + 1;
+                       return make_token<css_parser_token::token_type::delim_token>(c);
+
                        break;
                }
 
index cff5877c23eb9b598da06a2e91592708e37c8b26..5880241c12f28d1371a11fb97c5b217d5f000497 100644 (file)
@@ -47,14 +47,36 @@ struct css_parser_token {
                ebrace_token, /* ) */
                osqbrace_token, /* [ */
                esqbrace_token, /* ] */
+               ocurlbrace_token, /* { */
+               ecurlbrace_token, /* } */
                comma_token,
                colon_token,
                semicolon_token,
                eof_token,
        };
 
+       enum class dim_type : std::uint8_t {
+               dim_px,
+               dim_em,
+               dim_rem,
+               dim_ex,
+               dim_wv,
+               dim_wh,
+               dim_vmax,
+               dim_vmin,
+               dim_pt,
+               dim_cm,
+               dim_mm,
+               dim_in,
+               dim_pc,
+       };
+
        static const std::uint8_t default_flags = 0;
        static const std::uint8_t flag_bad_string = (1u << 0u);
+       static const std::uint8_t number_dimension = (1u << 1u);
+       static const std::uint8_t number_percent = (1u << 2u);
+       static const std::uint8_t flag_bad_dimension = (1u << 3u);
+
        using value_type = std::variant<std::string_view, /* For strings and string like tokens */
                        char, /* For delimiters (might need to move to unicode point) */
                        double, /* For numeric stuff */
@@ -65,10 +87,12 @@ struct css_parser_token {
        value_type value;
        token_type type;
        std::uint8_t flags = default_flags;
+       dim_type dim_type;
 
        css_parser_token() = delete;
        explicit css_parser_token(token_type type, const value_type &value) :
                        value(value), type(type) {}
+       auto adjust_dim(const css_parser_token &dim_token) -> bool;
 };
 
 /* Ensure that parser tokens are simple enough */
@@ -86,6 +110,9 @@ private:
        std::string_view input;
        std::size_t offset;
        rspamd_mempool_t *pool;
+
+       auto consume_number() -> struct css_parser_token;
+       auto consume_ident() -> struct css_parser_token;
 };
 
 }