From: Vsevolod Stakhov Date: Tue, 26 Jan 2021 14:44:01 +0000 (+0000) Subject: [Project] Css: Implement numbers and ident parsers X-Git-Tag: 3.0~729 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=52cb3f8d019522aee0c049d091772d082a8502f1;p=thirdparty%2Frspamd.git [Project] Css: Implement numbers and ident parsers --- diff --git a/src/libserver/css/css_tokeniser.cxx b/src/libserver/css/css_tokeniser.cxx index 058f7504e7..f545af47a7 100644 --- a/src/libserver/css/css_tokeniser.cxx +++ b/src/libserver/css/css_tokeniser.cxx @@ -16,6 +16,9 @@ #include "css_tokeniser.hxx" #include "css_util.hxx" +#include "css.hxx" +#include +#include namespace rspamd::css { @@ -35,6 +38,13 @@ auto make_token(co return css_parser_token{css_parser_token::token_type::string_token, s}; } +template<> +auto make_token(const std::string_view &s) +-> css_parser_token +{ + return css_parser_token{css_parser_token::token_type::ident_token, s}; +} + template<> auto make_token(const std::string_view &s) -> css_parser_token @@ -49,6 +59,13 @@ auto make_token(const char &c) return css_parser_token{css_parser_token::token_type::delim_token, c}; } +template<> +auto make_token(const double &d) +-> css_parser_token +{ + return css_parser_token{css_parser_token::token_type::number_token, d}; +} + /* * Generic tokens with no value (non-terminals) */ @@ -58,6 +75,287 @@ auto make_token(void) -> css_parser_token return css_parser_token{T, css_parser_token_placeholder()}; } +static constexpr inline auto is_plain_ident(char c) -> bool +{ + if ((c & 0x80) || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') { + return true; + } + + return false; +}; + +auto +css_parser_token::adjust_dim(const css_parser_token &dim_token) -> bool +{ + if (!std::holds_alternative(value) || + !std::holds_alternative(dim_token.value)) { + /* Invalid tokens */ + return false; + } + + auto num = std::get(value); + auto sv = std::get(dim_token.value); + + if (sv == "px") { + dim_type = css_parser_token::dim_type::dim_px; + flags |= css_parser_token::number_dimension; + num = (unsigned)num; /* Round to number */ + } + else if (sv == "em") { + dim_type = css_parser_token::dim_type::dim_em; + flags |= css_parser_token::number_dimension; + /* EM is 16 px, so multiply and round */ + num = (unsigned)(num * 16.0); + } + else if (sv == "rem") { + /* equal to EM in our case */ + dim_type = css_parser_token::dim_type::dim_rem; + flags |= css_parser_token::number_dimension; + num = (unsigned)(num * 16.0); + } + else if (sv == "ex") { + /* + * Represents the x-height of the element's font. + * On fonts with the "x" letter, this is generally the height + * of lowercase letters in the font; 1ex = 0.5em in many fonts. + */ + dim_type = css_parser_token::dim_type::dim_ex; + flags |= css_parser_token::number_dimension; + num = (unsigned)(num * 8.0); + } + else if (sv == "wv") { + /* + * Vewport width in percentages: + * we assume 1% of viewport width as 8px + */ + dim_type = css_parser_token::dim_type::dim_wv; + flags |= css_parser_token::number_dimension; + num = (unsigned)(num * 8.0); + } + else if (sv == "wh") { + /* + * Vewport height in percentages + * we assume 1% of viewport width as 6px + */ + dim_type = css_parser_token::dim_type::dim_wh; + flags |= css_parser_token::number_dimension; + num = (unsigned)(num * 6.0); + } + else if (sv == "vmax") { + /* + * Vewport width in percentages + * we assume 1% of viewport width as 6px + */ + dim_type = css_parser_token::dim_type::dim_vmax; + flags |= css_parser_token::number_dimension; + num = (unsigned)(num * 8.0); + } + else if (sv == "vmin") { + /* + * Vewport height in percentages + * we assume 1% of viewport width as 6px + */ + dim_type = css_parser_token::dim_type::dim_vmin; + flags |= css_parser_token::number_dimension; + num = (unsigned)(num * 6.0); + } + else if (sv == "pt") { + dim_type = css_parser_token::dim_type::dim_pt; + flags |= css_parser_token::number_dimension; + num = (num * 96.0 / 72.0); /* One point. 1pt = 1/72nd of 1in */ + } + else if (sv == "cm") { + dim_type = css_parser_token::dim_type::dim_cm; + flags |= css_parser_token::number_dimension; + num = (num * 96.0 / 2.54); /* 96px/2.54 */ + } + else if (sv == "mm") { + dim_type = css_parser_token::dim_type::dim_mm; + flags |= css_parser_token::number_dimension; + num = (num * 9.6 / 2.54); /* 9.6px/2.54 */ + } + else if (sv == "in") { + dim_type = css_parser_token::dim_type::dim_in; + flags |= css_parser_token::number_dimension; + num = (num * 96.0); /* 96px */ + } + else if (sv == "pc") { + dim_type = css_parser_token::dim_type::dim_pc; + flags |= css_parser_token::number_dimension; + num = (num * 96.0 / 6.0); /* 1pc = 12pt = 1/6th of 1in. */ + } + else { + flags |= css_parser_token::flag_bad_dimension; + + return false; + } + + value = num; + + return true; +} + + +/* + * Consume functions: return a token and advance lexer offset + */ +auto css_tokeniser::consume_ident() -> struct css_parser_token +{ + auto i = offset; + auto need_escape = false; + + /* Ident token can start from `-` or `--` */ + if (input[i] == '-') { + i ++; + + if (i < input.size() && input[i] == '-') { + i ++; + } + } + + while (i < input.size()) { + auto c = input[i]; + + if (!is_plain_ident(c)) { + if (c == '\\' && i + 1 < input.size ()) { + need_escape = true; + auto nhex = 0; + + /* Need to find an escape end */ + do { + c = input[++i]; + if (g_ascii_isxdigit(c)) { + nhex++; + + if (nhex > 6) { + /* End of the escape */ + break; + } + } + else if (nhex > 0 && c == ' ') { + /* \[hex]{1,6} */ + i++; /* Skip one space */ + break; + } + else { + /* Single \ + char */ + break; + } + } while (i < input.size ()); + } + else { + i --; /* Push token back */ + break; /* Not an ident token */ + } + } /* !plain ident */ + + i ++; + } + + if (need_escape) { + auto escaped = rspamd::css::unescape_css(pool, {&input[offset], i - offset}); + offset = i; + + return make_token(escaped); + } + + auto result = std::string_view{&input[offset], i - offset}; + offset = i; + + return make_token(result); +} + +auto css_tokeniser::consume_number() -> struct css_parser_token +{ + auto i = offset; + auto seen_dot = false, seen_exp = false; + + if (input[i] == '-') { + i ++; + } + if (input[i] == '.' && i < input.size()) { + seen_dot = true; + i ++; + } + + while (i < input.size()) { + auto c = input[i]; + + if (!g_ascii_isdigit(c)) { + if (c == '.') { + if (!seen_dot) { + seen_dot = true; + } + else { + i --; /* Push back */ + break; + } + } + else if (c == 'e' || c == 'E') { + if (!seen_exp) { + seen_exp = true; + seen_dot = true; /* dots are not allowed after e */ + + if (i + 1 < input.size()) { + auto next_c = input[i + 1]; + if (next_c == '+' || next_c == '-') { + i ++; + } + } + } + else { + i --; /* Push back */ + break; + } + } + } + + i ++; + } + + if (i > offset) { + double num; + + /* I wish it was supported properly */ + //auto conv_res = std::from_chars(&input[offset], &input[i], num); + std::string numbuf{&input[offset], (i - offset)}; + num = std::stod(numbuf); + + auto ret = make_token(num); + + if (i < input.size()) { + if (input[i] == '%') { + ret.flags |= css_parser_token::number_percent; + i ++; + } + else if (is_plain_ident(input[i])) { + auto dim_token = consume_ident(); + + if (dim_token.type == css_parser_token::token_type::ident_token) { + if (!dim_token.adjust_dim(dim_token)) { + auto sv = std::get(dim_token.value); + msg_debug_css("cannot apply dimension from the token %*s; number value = %.1f", + (int)sv.size(), sv.begin(), num); + } + } + } + } + + return ret; + } + else { + msg_err_css("internal error: invalid number, empty token"); + i ++; + } + + offset = i; + /* Should not happen */ + return make_token(input[i - 1]); +} + +/* + * Main routine to produce lexer tokens + */ auto css_tokeniser::next_token(void) -> struct css_parser_token { /* Helpers */ @@ -72,7 +370,7 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token auto nested = 0; /* We handle nested comments just because they can exist... */ - while (i < input.size () - 1) { + while (i < input.size() - 1) { auto c = input[i]; if (c == '*' && input[i + 1] == '/') { if (nested == 0) { @@ -106,16 +404,15 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token auto i = offset; bool need_unescape = false; - while (i < input.size ()) { + while (i < input.size()) { auto c = input[i]; if (c == '\\') { - if (i + 1 < input.size ()) { + if (i + 1 < input.size()) { need_unescape = true; } else { /* \ at the end -> ignore */ - } } else if (c == quote_char) { @@ -133,6 +430,8 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token else if (c == '\n') { /* Should be a error, but we ignore it for now */ } + + i ++; } /* EOF with no quote character, consider it fine */ @@ -148,15 +447,15 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token }; /* Main tokenisation loop */ - for (auto i = offset; i < input.size (); ++i) { + for (auto i = offset; i < input.size(); ++i) { auto c = input[i]; switch (c) { case '/': - if (i + 1 < input.size () && input[i + 1] == '*') { + if (i + 1 < input.size() && input[i + 1] == '*') { offset = i + 2; - consume_comment (); /* Consume comment and go forward */ - return next_token (); /* Tail call */ + consume_comment(); /* Consume comment and go forward */ + return next_token(); /* Tail call */ } else { offset = i + 1; @@ -171,7 +470,7 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token /* Consume as much space as we can */ do { c = input[++i]; - } while (i < input.size () && g_ascii_isspace (c)); + } while (i < input.size() && g_ascii_isspace(c)); auto ret = make_token( std::string_view(&input[offset], i - offset)); @@ -188,8 +487,22 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token case ')': offset = i + 1; return make_token(); + case '[': + offset = i + 1; + return make_token(); + case ']': + offset = i + 1; + return make_token(); + case '{': + offset = i + 1; + return make_token(); + case '}': + offset = i + 1; + return make_token(); case ',': return make_token(); + case ';': + return make_token(); case '<': /* Maybe an xml like comment */ if (i + 3 < input.size () && input[i + 1] == '!' @@ -202,6 +515,42 @@ auto css_tokeniser::next_token(void) -> struct css_parser_token offset = i + 1; return make_token(c); } + break; + case '-': + if (i + 1 < input.size()) { + auto next_c = input[i + 1]; + + if (g_ascii_isdigit(next_c)) { + /* negative number */ + return consume_number(); + } + else if (next_c == '-') { + if (i + 2 < input.size() && input[i + 2] == '>') { + /* XML like comment */ + return make_token(); + } + } + } + /* No other options, a delimiter - */ + offset = i + 1; + return make_token(c); + + break; + case '+': + case '.': + /* Maybe number */ + if (i + 1 < input.size()) { + auto next_c = input[i + 1]; + + if (g_ascii_isdigit(next_c)) { + /* Numeric token */ + return consume_number(); + } + } + /* No other options, a delimiter - */ + offset = i + 1; + return make_token(c); + break; } diff --git a/src/libserver/css/css_tokeniser.hxx b/src/libserver/css/css_tokeniser.hxx index cff5877c23..5880241c12 100644 --- a/src/libserver/css/css_tokeniser.hxx +++ b/src/libserver/css/css_tokeniser.hxx @@ -47,14 +47,36 @@ struct css_parser_token { ebrace_token, /* ) */ osqbrace_token, /* [ */ esqbrace_token, /* ] */ + ocurlbrace_token, /* { */ + ecurlbrace_token, /* } */ comma_token, colon_token, semicolon_token, eof_token, }; + enum class dim_type : std::uint8_t { + dim_px, + dim_em, + dim_rem, + dim_ex, + dim_wv, + dim_wh, + dim_vmax, + dim_vmin, + dim_pt, + dim_cm, + dim_mm, + dim_in, + dim_pc, + }; + static const std::uint8_t default_flags = 0; static const std::uint8_t flag_bad_string = (1u << 0u); + static const std::uint8_t number_dimension = (1u << 1u); + static const std::uint8_t number_percent = (1u << 2u); + static const std::uint8_t flag_bad_dimension = (1u << 3u); + using value_type = std::variant bool; }; /* Ensure that parser tokens are simple enough */ @@ -86,6 +110,9 @@ private: std::string_view input; std::size_t offset; rspamd_mempool_t *pool; + + auto consume_number() -> struct css_parser_token; + auto consume_ident() -> struct css_parser_token; }; }