From: Vsevolod Stakhov Date: Wed, 16 Jul 2025 11:31:26 +0000 (+0100) Subject: [Project] Rework system of html tags to allow more tag types X-Git-Tag: 3.13.0~43^2~5 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=12f0ef55dcadeb9d82484288d6dfb4a7b22cccfa;p=thirdparty%2Frspamd.git [Project] Rework system of html tags to allow more tag types --- diff --git a/src/libserver/css/css.cxx b/src/libserver/css/css.cxx index 1b369ed172..c53e3c05e5 100644 --- a/src/libserver/css/css.cxx +++ b/src/libserver/css/css.cxx @@ -1,11 +1,11 @@ -/*- - * Copyright 2021 Vsevolod Stakhov +/* + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -107,7 +107,6 @@ auto css_style_sheet::add_selector_rule(std::unique_ptr &&selector auto css_style_sheet::check_tag_block(const rspamd::html::html_tag *tag) -> rspamd::html::html_block * { - std::optional id_comp, class_comp; rspamd::html::html_block *res = nullptr; if (!tag) { @@ -115,14 +114,8 @@ auto css_style_sheet::check_tag_block(const rspamd::html::html_tag *tag) -> rspa } /* First, find id in a tag and a class */ - for (const auto &param: tag->components) { - if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_ID) { - id_comp = param.value; - } - else if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_CLASS) { - class_comp = param.value; - } - } + auto id_comp = tag->find_id(); + auto class_comp = tag->find_class(); /* ID part */ if (id_comp && !pimpl->id_selectors.empty()) { @@ -224,4 +217,4 @@ auto css_parse_style(rspamd_mempool_t *pool, return std::make_pair(nullptr, parse_res.error()); } -}// namespace rspamd::css 
\ No newline at end of file +}// namespace rspamd::css diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 93d1fdf91b..5597b7eb57 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -199,15 +199,44 @@ html_check_balance(struct html_content *hc, return nullptr; } -auto html_component_from_string(const std::string_view &st) -> std::optional +auto html_component_from_string(std::string_view name, std::string_view value) -> html_tag_component { - auto known_component_it = html_components_map.find(st); + auto known_component_it = html_components_map.find(name); if (known_component_it != html_components_map.end()) { - return known_component_it->second; + switch (known_component_it->second) { + case html_component_type::RSPAMD_HTML_COMPONENT_NAME: + return html_component_name{value}; + case html_component_type::RSPAMD_HTML_COMPONENT_HREF: + return html_component_href{value}; + case html_component_type::RSPAMD_HTML_COMPONENT_COLOR: + return html_component_color{value}; + case html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR: + return html_component_bgcolor{value}; + case html_component_type::RSPAMD_HTML_COMPONENT_STYLE: + return html_component_style{value}; + case html_component_type::RSPAMD_HTML_COMPONENT_CLASS: + return html_component_class{value}; + case html_component_type::RSPAMD_HTML_COMPONENT_WIDTH: + return html_component_width{value}; + case html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT: + return html_component_height{value}; + case html_component_type::RSPAMD_HTML_COMPONENT_SIZE: + return html_component_size{value}; + case html_component_type::RSPAMD_HTML_COMPONENT_REL: + return html_component_rel{value}; + case html_component_type::RSPAMD_HTML_COMPONENT_ALT: + return html_component_alt{value}; + case html_component_type::RSPAMD_HTML_COMPONENT_ID: + return html_component_id{value}; + case html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN: + return html_component_hidden{}; + default: + return 
html_component_unknown{name, value}; + } } else { - return std::nullopt; + return html_component_unknown{name, value}; } } @@ -234,13 +263,13 @@ enum tag_parser_state { struct tag_content_parser_state { tag_parser_state cur_state = parse_start; std::string buf; - std::optional cur_component; + std::string attr_name;// Store current attribute name void reset() { cur_state = parse_start; buf.clear(); - cur_component = std::nullopt; + attr_name.clear(); } }; @@ -254,56 +283,50 @@ html_parse_tag_content(rspamd_mempool_t *pool, auto state = parser_env.cur_state; /* - * Stores tag component if it doesn't exist, performing copy of the - * value + decoding of the entities - * Parser env is set to clear the current html attribute fields (saved_p and - * cur_component) + * Stores tag component creating the appropriate variant type + * Parser env is cleared after storing */ auto store_component_value = [&]() -> void { - if (parser_env.cur_component) { + if (!parser_env.attr_name.empty()) { + std::string_view attr_name_view, value_view; - if (parser_env.buf.empty()) { - tag->components.emplace_back(parser_env.cur_component.value(), - std::string_view{}); + // Store attribute name in persistent memory + if (!parser_env.attr_name.empty()) { + auto *name_storage = rspamd_mempool_alloc_buffer(pool, parser_env.attr_name.size()); + memcpy(name_storage, parser_env.attr_name.data(), parser_env.attr_name.size()); + attr_name_view = {name_storage, parser_env.attr_name.size()}; } - else { - /* We need to copy buf to a persistent storage */ - auto *s = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size()); - if (parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_ID || - parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) { - /* Lowercase */ - rspamd_str_copy_lc(parser_env.buf.data(), s, parser_env.buf.size()); + // Store value in persistent memory if not empty + if (!parser_env.buf.empty()) { + auto *value_storage = 
rspamd_mempool_alloc_buffer(pool, parser_env.buf.size()); + + // Lowercase for id and class attributes + if (parser_env.attr_name == "id" || parser_env.attr_name == "class") { + rspamd_str_copy_lc(parser_env.buf.data(), value_storage, parser_env.buf.size()); } else { - memcpy(s, parser_env.buf.data(), parser_env.buf.size()); + memcpy(value_storage, parser_env.buf.data(), parser_env.buf.size()); } - auto sz = rspamd_html_decode_entitles_inplace(s, parser_env.buf.size()); - tag->components.emplace_back(parser_env.cur_component.value(), - std::string_view{s, sz}); + auto sz = rspamd_html_decode_entitles_inplace(value_storage, parser_env.buf.size()); + value_view = {value_storage, sz}; } + + // Create the appropriate component variant + auto component = html_component_from_string(attr_name_view, value_view); + tag->components.emplace_back(std::move(component)); } parser_env.buf.clear(); - parser_env.cur_component = std::nullopt; + parser_env.attr_name.clear(); }; auto store_component_name = [&]() -> bool { decode_html_entitles_inplace(parser_env.buf); - auto known_component_it = html_components_map.find(std::string_view{parser_env.buf}); + parser_env.attr_name = parser_env.buf; parser_env.buf.clear(); - - if (known_component_it != html_components_map.end()) { - parser_env.cur_component = known_component_it->second; - - return true; - } - else { - parser_env.cur_component = std::nullopt; - } - - return false; + return true; }; auto store_value_character = [&](bool lc) -> void { @@ -620,7 +643,7 @@ html_process_url_tag(rspamd_mempool_t *pool, struct html_tag *tag, struct html_content *hc) -> std::optional { - auto found_href_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_HREF); + auto found_href_maybe = tag->find_href(); if (found_href_maybe) { /* Check base url */ @@ -816,130 +839,117 @@ html_process_img_tag(rspamd_mempool_t *pool, img = rspamd_mempool_alloc0_type(pool, struct html_image); img->tag = tag; - for (const auto &param: tag->components) { 
+ // Process HREF component + if (auto href_value = tag->find_href()) { + if (href_value->size() > 0) { + rspamd_ftok_t fstr; + fstr.begin = href_value->data(); + fstr.len = href_value->size(); + img->src = rspamd_mempool_ftokdup(pool, &fstr); - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) { - /* Check base url */ - const auto &href_value = param.value; - - if (href_value.size() > 0) { - rspamd_ftok_t fstr; - fstr.begin = href_value.data(); - fstr.len = href_value.size(); - img->src = rspamd_mempool_ftokdup(pool, &fstr); - - if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(), - "cid:", sizeof("cid:") - 1) == 0) { - /* We have an embedded image */ - img->src += sizeof("cid:") - 1; - img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED; + if (href_value->size() > sizeof("cid:") - 1 && memcmp(href_value->data(), + "cid:", sizeof("cid:") - 1) == 0) { + /* We have an embedded image */ + img->src += sizeof("cid:") - 1; + img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED; + } + else { + if (href_value->size() > sizeof("data:") - 1 && memcmp(href_value->data(), + "data:", sizeof("data:") - 1) == 0) { + /* We have an embedded image in HTML tag */ + img->flags |= + (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA); + html_process_data_image(pool, img, *href_value); + hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS; } else { - if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(), - "data:", sizeof("data:") - 1) == 0) { - /* We have an embedded image in HTML tag */ - img->flags |= - (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA); - html_process_data_image(pool, img, href_value); - hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS; - } - else { - img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL; - if (img->src) { - - std::string_view cpy{href_value}; - auto maybe_url = html_process_url(pool, cpy); - - if (maybe_url) { - img->url = maybe_url.value(); - struct rspamd_url *existing; - - img->url->flags |= 
RSPAMD_URL_FLAG_IMAGE; - existing = rspamd_url_set_add_or_return(url_set, - img->url); - - if (existing && existing != img->url) { - /* - * We have some other URL that could be - * found, e.g. from another part. However, - * we still want to set an image flag on it - */ - existing->flags |= img->url->flags; - existing->count++; - } - else if (part_urls) { - /* New url */ - g_ptr_array_add(part_urls, img->url); - } + img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL; + if (img->src) { + + std::string_view cpy{*href_value}; + auto maybe_url = html_process_url(pool, cpy); + + if (maybe_url) { + img->url = maybe_url.value(); + struct rspamd_url *existing; + + img->url->flags |= RSPAMD_URL_FLAG_IMAGE; + existing = rspamd_url_set_add_or_return(url_set, + img->url); + + if (existing && existing != img->url) { + /* + * We have some other URL that could be + * found, e.g. from another part. However, + * we still want to set an image flag on it + */ + existing->flags |= img->url->flags; + existing->count++; + } + else if (part_urls) { + /* New url */ + g_ptr_array_add(part_urls, img->url); } } } } } } + } + // Process numeric dimensions using the new helper methods + if (auto height = tag->find_height()) { + img->height = height.value(); + } - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT) { - unsigned long val; - - rspamd_strtoul(param.value.data(), param.value.size(), &val); - img->height = val; - } - - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_WIDTH) { - unsigned long val; - - rspamd_strtoul(param.value.data(), param.value.size(), &val); - img->width = val; - } - - /* TODO: rework to css at some time */ - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) { - if (img->height == 0) { - auto style_st = param.value; - auto pos = rspamd_substring_search_caseless(style_st.data(), - style_st.size(), - "height", sizeof("height") - 1); - if (pos != -1) { - auto substr = style_st.substr(pos + sizeof("height") - 1); + if (auto 
width = tag->find_width()) { + img->width = width.value(); + } - for (auto i = 0; i < substr.size(); i++) { - auto t = substr[i]; - if (g_ascii_isdigit(t)) { - unsigned long val; - rspamd_strtoul(substr.data(), - substr.size(), &val); - img->height = val; - break; - } - else if (!g_ascii_isspace(t) && t != '=' && t != ':') { - /* Fallback */ - break; - } + // Process style component for dimensions + if (auto style_value = tag->find_style()) { + if (img->height == 0) { + auto pos = rspamd_substring_search_caseless(style_value->data(), + style_value->size(), + "height", sizeof("height") - 1); + if (pos != -1) { + auto substr = style_value->substr(pos + sizeof("height") - 1); + + for (auto i = 0; i < substr.size(); i++) { + auto t = substr[i]; + if (g_ascii_isdigit(t)) { + unsigned long val; + rspamd_strtoul(substr.data(), + substr.size(), &val); + img->height = val; + break; + } + else if (!g_ascii_isspace(t) && t != '=' && t != ':') { + /* Fallback */ + break; } } } - if (img->width == 0) { - auto style_st = param.value; - auto pos = rspamd_substring_search_caseless(style_st.data(), - style_st.size(), - "width", sizeof("width") - 1); - if (pos != -1) { - auto substr = style_st.substr(pos + sizeof("width") - 1); - - for (auto i = 0; i < substr.size(); i++) { - auto t = substr[i]; - if (g_ascii_isdigit(t)) { - unsigned long val; - rspamd_strtoul(substr.data(), - substr.size(), &val); - img->width = val; - break; - } - else if (!g_ascii_isspace(t) && t != '=' && t != ':') { - /* Fallback */ - break; - } + } + if (img->width == 0) { + auto pos = rspamd_substring_search_caseless(style_value->data(), + style_value->size(), + "width", sizeof("width") - 1); + if (pos != -1) { + auto substr = style_value->substr(pos + sizeof("width") - 1); + + for (auto i = 0; i < substr.size(); i++) { + auto t = substr[i]; + if (g_ascii_isdigit(t)) { + unsigned long val; + rspamd_strtoul(substr.data(), + substr.size(), &val); + img->width = val; + break; + } + else if (!g_ascii_isspace(t) 
&& t != '=' && t != ':') { + /* Fallback */ + break; } } } @@ -968,7 +978,7 @@ html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag, khash_t(rspamd_url_hash) * url_set, GPtrArray *part_urls) -> void { - auto found_rel_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_REL); + auto found_rel_maybe = tag->find_rel(); if (found_rel_maybe) { if (found_rel_maybe.value() == "icon") { @@ -984,24 +994,23 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag, std::optional maybe_fgcolor, maybe_bgcolor; bool hidden = false; - for (const auto &param: tag->components) { - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) { - maybe_fgcolor = css::css_value::maybe_color_from_string(param.value); - } - - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) { - maybe_bgcolor = css::css_value::maybe_color_from_string(param.value); - } + // Process color components + if (auto color_comp = tag->find_component()) { + maybe_fgcolor = css::css_value::maybe_color_from_string(color_comp.value()->value); + } - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) { - tag->block = rspamd::css::parse_css_declaration(pool, param.value); - } + if (auto bgcolor_comp = tag->find_component()) { + maybe_bgcolor = css::css_value::maybe_color_from_string(bgcolor_comp.value()->value); + } - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN) { - hidden = true; - } + // Process style component + if (auto style_value = tag->find_style()) { + tag->block = rspamd::css::parse_css_declaration(pool, *style_value); } + // Check if hidden + hidden = tag->is_hidden(); + if (!tag->block) { tag->block = html_block::undefined_html_block_pool(pool); } @@ -1284,7 +1293,7 @@ html_append_tag_content(rspamd_mempool_t *pool, } else if (tag->id == Tag_IMG) { /* Process ALT if presented */ - auto maybe_alt = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_ALT); + auto maybe_alt = 
tag->find_alt(); if (maybe_alt) { if (!hc->parsed.empty() && !g_ascii_isspace(hc->parsed.back())) { @@ -1384,9 +1393,7 @@ auto html_process_input(struct rspamd_task *task, overflow_input = true; } - auto new_tag = [&](int flags = 0) -> struct html_tag * - { - + auto new_tag = [&](int flags = 0) -> struct html_tag * { if (hc->all_tags.size() > rspamd::html::max_tags) { hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS; @@ -2151,7 +2158,7 @@ auto html_process_input(struct rspamd_task *task, /* Leftover after content */ switch (state) { case tags_limit_overflow: - html_append_parsed(hc, {c, (std::size_t)(end - c)}, + html_append_parsed(hc, {c, (std::size_t) (end - c)}, false, end - start, hc->parsed); break; default: @@ -2390,4 +2397,4 @@ gsize rspamd_html_get_tags_count(void *html_content) } return hc->all_tags.size(); -} \ No newline at end of file +} diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx index 309d76177e..a6b366a913 100644 --- a/src/libserver/html/html_tag.hxx +++ b/src/libserver/html/html_tag.hxx @@ -1,11 +1,11 @@ -/*- - * Copyright 2021 Vsevolod Stakhov +/* + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -26,6 +26,7 @@ #include #include "html_tags.h" +#include "libutil/str_util.h" struct rspamd_url; struct html_image; @@ -34,6 +35,7 @@ namespace rspamd::html { struct html_content; /* Forward declaration */ +// Internal enum for mapping (not exposed in public API) enum class html_component_type : std::uint8_t { RSPAMD_HTML_COMPONENT_NAME = 0, RSPAMD_HTML_COMPONENT_HREF, @@ -50,6 +52,269 @@ enum class html_component_type : std::uint8_t { RSPAMD_HTML_COMPONENT_HIDDEN, }; +// Forward declarations for component types +struct html_component_name; +struct html_component_href; +struct html_component_color; +struct html_component_bgcolor; +struct html_component_style; +struct html_component_class; +struct html_component_width; +struct html_component_height; +struct html_component_size; +struct html_component_rel; +struct html_component_alt; +struct html_component_id; +struct html_component_hidden; +struct html_component_unknown; + +// Base interface for all components +struct html_component_base { + virtual ~html_component_base() = default; + virtual std::string_view get_string_value() const = 0; +}; + +// String-based components +struct html_component_name : html_component_base { + std::string_view value; + explicit html_component_name(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_href : html_component_base { + std::string_view value; + explicit html_component_href(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_style : html_component_base { + std::string_view value; + explicit 
html_component_style(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_class : html_component_base { + std::string_view value; + explicit html_component_class(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_rel : html_component_base { + std::string_view value; + explicit html_component_rel(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_alt : html_component_base { + std::string_view value; + explicit html_component_alt(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_id : html_component_base { + std::string_view value; + explicit html_component_id(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Color components (could be extended to parse actual colors) +struct html_component_color : html_component_base { + std::string_view value; + explicit html_component_color(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_bgcolor : html_component_base { + std::string_view value; + explicit html_component_bgcolor(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Numeric components +struct html_component_width : html_component_base { + std::string_view raw_value; + std::optional numeric_value; + + explicit html_component_width(const std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast(val); + } + } + + std::string_view get_string_value() const override + 
{ + return raw_value; + } + std::optional get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_height : html_component_base { + std::string_view raw_value; + std::optional numeric_value; + + explicit html_component_height(const std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_size : html_component_base { + std::string_view raw_value; + std::optional numeric_value; + + explicit html_component_size(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional get_numeric_value() const + { + return numeric_value; + } +}; + +// Boolean/flag component +struct html_component_hidden : html_component_base { + bool present; + explicit html_component_hidden() + : present(true) + { + } + std::string_view get_string_value() const override + { + return present ? 
"true" : "false"; + } + bool is_present() const + { + return present; + } +}; + +// Unknown component with both name and value +struct html_component_unknown : html_component_base { + std::string_view name; + std::string_view value; + + html_component_unknown(std::string_view n, std::string_view v) + : name(n), value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } + std::string_view get_name() const + { + return name; + } +}; + +// The variant type that holds all possible components +using html_tag_component = std::variant< + html_component_name, + html_component_href, + html_component_color, + html_component_bgcolor, + html_component_style, + html_component_class, + html_component_width, + html_component_height, + html_component_size, + html_component_rel, + html_component_alt, + html_component_id, + html_component_hidden, + html_component_unknown>; + +/** + * Returns component variant from a string + * @param name attribute name + * @param value attribute value + * @return variant component + */ +auto html_component_from_string(std::string_view name, std::string_view value) -> html_tag_component; + /* Public tags flags */ /* XML tag */ #define FL_XML (1u << CM_USER_SHIFT) @@ -62,23 +327,7 @@ enum class html_component_type : std::uint8_t { #define FL_COMMENT (1 << (CM_USER_SHIFT + 6)) #define FL_VIRTUAL (1 << (CM_USER_SHIFT + 7)) -/** - * Returns component type from a string - * @param st - * @return - */ -auto html_component_from_string(const std::string_view &st) -> std::optional; - using html_tag_extra_t = std::variant; -struct html_tag_component { - html_component_type type; - std::string_view value; - - html_tag_component(html_component_type type, std::string_view value) - : type(type), value(value) - { - } -}; /* Pairing closing tag representation */ struct html_closing_tag { @@ -105,26 +354,174 @@ struct html_tag { std::vector children; struct html_tag *parent; - auto find_component(html_component_type what) const -> 
std::optional + // Template method to find component by type + template + auto find_component() const -> std::optional { for (const auto &comp: components) { - if (comp.type == what) { - return comp.value; + if (std::holds_alternative(comp)) { + return &std::get(comp); } } + return std::nullopt; + } + // Helper methods for common component access + auto find_href() const -> std::optional + { + if (auto comp = find_component()) { + return comp.value()->value; + } return std::nullopt; } - auto find_component(std::optional what) const -> std::optional + auto find_class() const -> std::optional { - if (what) { - return find_component(what.value()); + if (auto comp = find_component()) { + return comp.value()->value; } + return std::nullopt; + } + + auto find_id() const -> std::optional + { + if (auto comp = find_component()) { + return comp.value()->value; + } + return std::nullopt; + } + auto find_width() const -> std::optional + { + if (auto comp = find_component()) { + return comp.value()->get_numeric_value(); + } + return std::nullopt; + } + + auto find_height() const -> std::optional + { + if (auto comp = find_component()) { + return comp.value()->get_numeric_value(); + } + return std::nullopt; + } + + auto find_style() const -> std::optional + { + if (auto comp = find_component()) { + return comp.value()->value; + } return std::nullopt; } + auto find_alt() const -> std::optional + { + if (auto comp = find_component()) { + return comp.value()->value; + } + return std::nullopt; + } + + auto find_rel() const -> std::optional + { + if (auto comp = find_component()) { + return comp.value()->value; + } + return std::nullopt; + } + + auto is_hidden() const -> bool + { + return find_component().has_value(); + } + + auto find_unknown_component(std::string_view attr_name) const -> std::optional + { + for (const auto &comp: components) { + if (std::holds_alternative(comp)) { + const auto &unknown = std::get(comp); + if (unknown.name == attr_name) { + return unknown.value; + 
} + } + } + return std::nullopt; + } + + auto get_unknown_components() const -> std::vector> + { + std::vector> unknown_attrs; + for (const auto &comp: components) { + if (std::holds_alternative(comp)) { + const auto &unknown = std::get(comp); + unknown_attrs.emplace_back(unknown.name, unknown.value); + } + } + return unknown_attrs; + } + + // Generic visitor method for processing all components + template + auto visit_components(Visitor &&visitor) const + { + for (const auto &comp: components) { + std::visit(std::forward(visitor), comp); + } + } + + // Find any component by attribute name (for Lua bindings and generic access) + auto find_component_by_name(std::string_view attr_name) const -> std::optional + { + // Check known component types first using their helper methods + if (attr_name == "href") return find_href(); + if (attr_name == "class") return find_class(); + if (attr_name == "id") return find_id(); + if (attr_name == "style") return find_style(); + if (attr_name == "alt") return find_alt(); + if (attr_name == "rel") return find_rel(); + if (attr_name == "hidden") return is_hidden() ? 
std::optional{"true"} : std::nullopt; + + // Handle numeric components that need string conversion + if (attr_name == "width") { + if (auto comp = find_component()) { + return comp.value()->get_string_value(); + } + } + if (attr_name == "height") { + if (auto comp = find_component()) { + return comp.value()->get_string_value(); + } + } + if (attr_name == "size") { + if (auto comp = find_component()) { + return comp.value()->get_string_value(); + } + } + + // Handle color components + if (attr_name == "color") { + if (auto comp = find_component()) { + return comp.value()->value; + } + } + if (attr_name == "bgcolor") { + if (auto comp = find_component()) { + return comp.value()->value; + } + } + + // Handle name component + if (attr_name == "name") { + if (auto comp = find_component()) { + return comp.value()->value; + } + } + + // Finally check unknown components + return find_unknown_component(attr_name); + } + auto clear(void) -> void { id = Tag_UNKNOWN; diff --git a/src/lua/lua_html.cxx b/src/lua/lua_html.cxx index 090e2af55c..a03247c8ad 100644 --- a/src/lua/lua_html.cxx +++ b/src/lua/lua_html.cxx @@ -712,8 +712,7 @@ lua_html_tag_get_attribute(lua_State *L) const char *attr_name = luaL_checklstring(L, 2, &slen); if (ltag && attr_name) { - auto maybe_attr = ltag->tag->find_component( - rspamd::html::html_component_from_string({attr_name, slen})); + auto maybe_attr = ltag->tag->find_component_by_name({attr_name, slen}); if (maybe_attr) { lua_pushlstring(L, maybe_attr->data(), maybe_attr->size());