git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Project] Rework system of html tags to allow more tag types
author: Vsevolod Stakhov <vsevolod@rspamd.com>
Wed, 16 Jul 2025 11:31:26 +0000 (12:31 +0100)
committer: Vsevolod Stakhov <vsevolod@rspamd.com>
Wed, 16 Jul 2025 11:31:26 +0000 (12:31 +0100)
src/libserver/css/css.cxx
src/libserver/html/html.cxx
src/libserver/html/html_tag.hxx
src/lua/lua_html.cxx

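diff --git a/src/libserver/css/css.cxx b/src/libserver/css/css.cxx
index 1b369ed17252c8a0ec822d7869671bcbec236959..c53e3c05e572aeab0ca301fd6d449b8170f57bd6 100644
--- a/src/libserver/css/css.cxx
+++ b/src/libserver/css/css.cxx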
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2021 Vsevolod Stakhov
+/*
+ * Copyright 2025 Vsevolod Stakhov
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- *   http://www.apache.org/licenses/LICENSE-2.0
+ *    http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
@@ -107,7 +107,6 @@ auto css_style_sheet::add_selector_rule(std::unique_ptr<css_selector> &&selector
 
 auto css_style_sheet::check_tag_block(const rspamd::html::html_tag *tag) -> rspamd::html::html_block *
 {
-       std::optional<std::string_view> id_comp, class_comp;
        rspamd::html::html_block *res = nullptr;
 
        if (!tag) {
@@ -115,14 +114,8 @@ auto css_style_sheet::check_tag_block(const rspamd::html::html_tag *tag) -> rspa
        }
 
        /* First, find id in a tag and a class */
-       for (const auto &param: tag->components) {
-               if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_ID) {
-                       id_comp = param.value;
-               }
-               else if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_CLASS) {
-                       class_comp = param.value;
-               }
-       }
+       auto id_comp = tag->find_id();
+       auto class_comp = tag->find_class();
 
        /* ID part */
        if (id_comp && !pimpl->id_selectors.empty()) {
@@ -224,4 +217,4 @@ auto css_parse_style(rspamd_mempool_t *pool,
        return std::make_pair(nullptr, parse_res.error());
 }
 
-}// namespace rspamd::css
\ No newline at end of file
+}// namespace rspamd::css
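diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 93d1fdf91b32a57e62c2580ef1316ccd364a9003..5597b7eb5705dbaa84dd64287f9db1fdc0db5d4d 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx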
@@ -199,15 +199,44 @@ html_check_balance(struct html_content *hc,
        return nullptr;
 }
 
-auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>
+auto html_component_from_string(std::string_view name, std::string_view value) -> html_tag_component
 {
-       auto known_component_it = html_components_map.find(st);
+       auto known_component_it = html_components_map.find(name);
 
        if (known_component_it != html_components_map.end()) {
-               return known_component_it->second;
+               switch (known_component_it->second) {
+               case html_component_type::RSPAMD_HTML_COMPONENT_NAME:
+                       return html_component_name{value};
+               case html_component_type::RSPAMD_HTML_COMPONENT_HREF:
+                       return html_component_href{value};
+               case html_component_type::RSPAMD_HTML_COMPONENT_COLOR:
+                       return html_component_color{value};
+               case html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR:
+                       return html_component_bgcolor{value};
+               case html_component_type::RSPAMD_HTML_COMPONENT_STYLE:
+                       return html_component_style{value};
+               case html_component_type::RSPAMD_HTML_COMPONENT_CLASS:
+                       return html_component_class{value};
+               case html_component_type::RSPAMD_HTML_COMPONENT_WIDTH:
+                       return html_component_width{value};
+               case html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT:
+                       return html_component_height{value};
+               case html_component_type::RSPAMD_HTML_COMPONENT_SIZE:
+                       return html_component_size{value};
+               case html_component_type::RSPAMD_HTML_COMPONENT_REL:
+                       return html_component_rel{value};
+               case html_component_type::RSPAMD_HTML_COMPONENT_ALT:
+                       return html_component_alt{value};
+               case html_component_type::RSPAMD_HTML_COMPONENT_ID:
+                       return html_component_id{value};
+               case html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN:
+                       return html_component_hidden{};
+               default:
+                       return html_component_unknown{name, value};
+               }
        }
        else {
-               return std::nullopt;
+               return html_component_unknown{name, value};
        }
 }
 
@@ -234,13 +263,13 @@ enum tag_parser_state {
 struct tag_content_parser_state {
        tag_parser_state cur_state = parse_start;
        std::string buf;
-       std::optional<html_component_type> cur_component;
+       std::string attr_name;// Store current attribute name
 
        void reset()
        {
                cur_state = parse_start;
                buf.clear();
-               cur_component = std::nullopt;
+               attr_name.clear();
        }
 };
 
@@ -254,56 +283,50 @@ html_parse_tag_content(rspamd_mempool_t *pool,
        auto state = parser_env.cur_state;
 
        /*
-        * Stores tag component if it doesn't exist, performing copy of the
-        * value + decoding of the entities
-        * Parser env is set to clear the current html attribute fields (saved_p and
-        * cur_component)
+        * Stores tag component creating the appropriate variant type
+        * Parser env is cleared after storing
         */
        auto store_component_value = [&]() -> void {
-               if (parser_env.cur_component) {
+               if (!parser_env.attr_name.empty()) {
+                       std::string_view attr_name_view, value_view;
 
-                       if (parser_env.buf.empty()) {
-                               tag->components.emplace_back(parser_env.cur_component.value(),
-                                                                                        std::string_view{});
+                       // Store attribute name in persistent memory
+                       if (!parser_env.attr_name.empty()) {
+                               auto *name_storage = rspamd_mempool_alloc_buffer(pool, parser_env.attr_name.size());
+                               memcpy(name_storage, parser_env.attr_name.data(), parser_env.attr_name.size());
+                               attr_name_view = {name_storage, parser_env.attr_name.size()};
                        }
-                       else {
-                               /* We need to copy buf to a persistent storage */
-                               auto *s = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size());
 
-                               if (parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_ID ||
-                                       parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) {
-                                       /* Lowercase */
-                                       rspamd_str_copy_lc(parser_env.buf.data(), s, parser_env.buf.size());
+                       // Store value in persistent memory if not empty
+                       if (!parser_env.buf.empty()) {
+                               auto *value_storage = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size());
+
+                               // Lowercase for id and class attributes
+                               if (parser_env.attr_name == "id" || parser_env.attr_name == "class") {
+                                       rspamd_str_copy_lc(parser_env.buf.data(), value_storage, parser_env.buf.size());
                                }
                                else {
-                                       memcpy(s, parser_env.buf.data(), parser_env.buf.size());
+                                       memcpy(value_storage, parser_env.buf.data(), parser_env.buf.size());
                                }
 
-                               auto sz = rspamd_html_decode_entitles_inplace(s, parser_env.buf.size());
-                               tag->components.emplace_back(parser_env.cur_component.value(),
-                                                                                        std::string_view{s, sz});
+                               auto sz = rspamd_html_decode_entitles_inplace(value_storage, parser_env.buf.size());
+                               value_view = {value_storage, sz};
                        }
+
+                       // Create the appropriate component variant
+                       auto component = html_component_from_string(attr_name_view, value_view);
+                       tag->components.emplace_back(std::move(component));
                }
 
                parser_env.buf.clear();
-               parser_env.cur_component = std::nullopt;
+               parser_env.attr_name.clear();
        };
 
        auto store_component_name = [&]() -> bool {
                decode_html_entitles_inplace(parser_env.buf);
-               auto known_component_it = html_components_map.find(std::string_view{parser_env.buf});
+               parser_env.attr_name = parser_env.buf;
                parser_env.buf.clear();
-
-               if (known_component_it != html_components_map.end()) {
-                       parser_env.cur_component = known_component_it->second;
-
-                       return true;
-               }
-               else {
-                       parser_env.cur_component = std::nullopt;
-               }
-
-               return false;
+               return true;
        };
 
        auto store_value_character = [&](bool lc) -> void {
@@ -620,7 +643,7 @@ html_process_url_tag(rspamd_mempool_t *pool,
                                         struct html_tag *tag,
                                         struct html_content *hc) -> std::optional<struct rspamd_url *>
 {
-       auto found_href_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_HREF);
+       auto found_href_maybe = tag->find_href();
 
        if (found_href_maybe) {
                /* Check base url */
@@ -816,130 +839,117 @@ html_process_img_tag(rspamd_mempool_t *pool,
        img = rspamd_mempool_alloc0_type(pool, struct html_image);
        img->tag = tag;
 
-       for (const auto &param: tag->components) {
+       // Process HREF component
+       if (auto href_value = tag->find_href()) {
+               if (href_value->size() > 0) {
+                       rspamd_ftok_t fstr;
+                       fstr.begin = href_value->data();
+                       fstr.len = href_value->size();
+                       img->src = rspamd_mempool_ftokdup(pool, &fstr);
 
-               if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) {
-                       /* Check base url */
-                       const auto &href_value = param.value;
-
-                       if (href_value.size() > 0) {
-                               rspamd_ftok_t fstr;
-                               fstr.begin = href_value.data();
-                               fstr.len = href_value.size();
-                               img->src = rspamd_mempool_ftokdup(pool, &fstr);
-
-                               if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(),
-                                                                                                                                        "cid:", sizeof("cid:") - 1) == 0) {
-                                       /* We have an embedded image */
-                                       img->src += sizeof("cid:") - 1;
-                                       img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
+                       if (href_value->size() > sizeof("cid:") - 1 && memcmp(href_value->data(),
+                                                                                                                                 "cid:", sizeof("cid:") - 1) == 0) {
+                               /* We have an embedded image */
+                               img->src += sizeof("cid:") - 1;
+                               img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
+                       }
+                       else {
+                               if (href_value->size() > sizeof("data:") - 1 && memcmp(href_value->data(),
+                                                                                                                                          "data:", sizeof("data:") - 1) == 0) {
+                                       /* We have an embedded image in HTML tag */
+                                       img->flags |=
+                                               (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
+                                       html_process_data_image(pool, img, *href_value);
+                                       hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
                                }
                                else {
-                                       if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(),
-                                                                                                                                                 "data:", sizeof("data:") - 1) == 0) {
-                                               /* We have an embedded image in HTML tag */
-                                               img->flags |=
-                                                       (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
-                                               html_process_data_image(pool, img, href_value);
-                                               hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
-                                       }
-                                       else {
-                                               img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
-                                               if (img->src) {
-
-                                                       std::string_view cpy{href_value};
-                                                       auto maybe_url = html_process_url(pool, cpy);
-
-                                                       if (maybe_url) {
-                                                               img->url = maybe_url.value();
-                                                               struct rspamd_url *existing;
-
-                                                               img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
-                                                               existing = rspamd_url_set_add_or_return(url_set,
-                                                                                                                                               img->url);
-
-                                                               if (existing && existing != img->url) {
-                                                                       /*
-                                                                        * We have some other URL that could be
-                                                                        * found, e.g. from another part. However,
-                                                                        * we still want to set an image flag on it
-                                                                        */
-                                                                       existing->flags |= img->url->flags;
-                                                                       existing->count++;
-                                                               }
-                                                               else if (part_urls) {
-                                                                       /* New url */
-                                                                       g_ptr_array_add(part_urls, img->url);
-                                                               }
+                                       img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
+                                       if (img->src) {
+
+                                               std::string_view cpy{*href_value};
+                                               auto maybe_url = html_process_url(pool, cpy);
+
+                                               if (maybe_url) {
+                                                       img->url = maybe_url.value();
+                                                       struct rspamd_url *existing;
+
+                                                       img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
+                                                       existing = rspamd_url_set_add_or_return(url_set,
+                                                                                                                                       img->url);
+
+                                                       if (existing && existing != img->url) {
+                                                               /*
+                                                                * We have some other URL that could be
+                                                                * found, e.g. from another part. However,
+                                                                * we still want to set an image flag on it
+                                                                */
+                                                               existing->flags |= img->url->flags;
+                                                               existing->count++;
+                                                       }
+                                                       else if (part_urls) {
+                                                               /* New url */
+                                                               g_ptr_array_add(part_urls, img->url);
                                                        }
                                                }
                                        }
                                }
                        }
                }
+       }
 
+       // Process numeric dimensions using the new helper methods
+       if (auto height = tag->find_height()) {
+               img->height = height.value();
+       }
 
-               if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT) {
-                       unsigned long val;
-
-                       rspamd_strtoul(param.value.data(), param.value.size(), &val);
-                       img->height = val;
-               }
-
-               if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_WIDTH) {
-                       unsigned long val;
-
-                       rspamd_strtoul(param.value.data(), param.value.size(), &val);
-                       img->width = val;
-               }
-
-               /* TODO: rework to css at some time */
-               if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
-                       if (img->height == 0) {
-                               auto style_st = param.value;
-                               auto pos = rspamd_substring_search_caseless(style_st.data(),
-                                                                                                                       style_st.size(),
-                                                                                                                       "height", sizeof("height") - 1);
-                               if (pos != -1) {
-                                       auto substr = style_st.substr(pos + sizeof("height") - 1);
+       if (auto width = tag->find_width()) {
+               img->width = width.value();
+       }
 
-                                       for (auto i = 0; i < substr.size(); i++) {
-                                               auto t = substr[i];
-                                               if (g_ascii_isdigit(t)) {
-                                                       unsigned long val;
-                                                       rspamd_strtoul(substr.data(),
-                                                                                  substr.size(), &val);
-                                                       img->height = val;
-                                                       break;
-                                               }
-                                               else if (!g_ascii_isspace(t) && t != '=' && t != ':') {
-                                                       /* Fallback */
-                                                       break;
-                                               }
+       // Process style component for dimensions
+       if (auto style_value = tag->find_style()) {
+               if (img->height == 0) {
+                       auto pos = rspamd_substring_search_caseless(style_value->data(),
+                                                                                                               style_value->size(),
+                                                                                                               "height", sizeof("height") - 1);
+                       if (pos != -1) {
+                               auto substr = style_value->substr(pos + sizeof("height") - 1);
+
+                               for (auto i = 0; i < substr.size(); i++) {
+                                       auto t = substr[i];
+                                       if (g_ascii_isdigit(t)) {
+                                               unsigned long val;
+                                               rspamd_strtoul(substr.data(),
+                                                                          substr.size(), &val);
+                                               img->height = val;
+                                               break;
+                                       }
+                                       else if (!g_ascii_isspace(t) && t != '=' && t != ':') {
+                                               /* Fallback */
+                                               break;
                                        }
                                }
                        }
-                       if (img->width == 0) {
-                               auto style_st = param.value;
-                               auto pos = rspamd_substring_search_caseless(style_st.data(),
-                                                                                                                       style_st.size(),
-                                                                                                                       "width", sizeof("width") - 1);
-                               if (pos != -1) {
-                                       auto substr = style_st.substr(pos + sizeof("width") - 1);
-
-                                       for (auto i = 0; i < substr.size(); i++) {
-                                               auto t = substr[i];
-                                               if (g_ascii_isdigit(t)) {
-                                                       unsigned long val;
-                                                       rspamd_strtoul(substr.data(),
-                                                                                  substr.size(), &val);
-                                                       img->width = val;
-                                                       break;
-                                               }
-                                               else if (!g_ascii_isspace(t) && t != '=' && t != ':') {
-                                                       /* Fallback */
-                                                       break;
-                                               }
+               }
+               if (img->width == 0) {
+                       auto pos = rspamd_substring_search_caseless(style_value->data(),
+                                                                                                               style_value->size(),
+                                                                                                               "width", sizeof("width") - 1);
+                       if (pos != -1) {
+                               auto substr = style_value->substr(pos + sizeof("width") - 1);
+
+                               for (auto i = 0; i < substr.size(); i++) {
+                                       auto t = substr[i];
+                                       if (g_ascii_isdigit(t)) {
+                                               unsigned long val;
+                                               rspamd_strtoul(substr.data(),
+                                                                          substr.size(), &val);
+                                               img->width = val;
+                                               break;
+                                       }
+                                       else if (!g_ascii_isspace(t) && t != '=' && t != ':') {
+                                               /* Fallback */
+                                               break;
                                        }
                                }
                        }
@@ -968,7 +978,7 @@ html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag,
                                          khash_t(rspamd_url_hash) * url_set,
                                          GPtrArray *part_urls) -> void
 {
-       auto found_rel_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_REL);
+       auto found_rel_maybe = tag->find_rel();
 
        if (found_rel_maybe) {
                if (found_rel_maybe.value() == "icon") {
@@ -984,24 +994,23 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
        std::optional<css::css_value> maybe_fgcolor, maybe_bgcolor;
        bool hidden = false;
 
-       for (const auto &param: tag->components) {
-               if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) {
-                       maybe_fgcolor = css::css_value::maybe_color_from_string(param.value);
-               }
-
-               if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) {
-                       maybe_bgcolor = css::css_value::maybe_color_from_string(param.value);
-               }
+       // Process color components
+       if (auto color_comp = tag->find_component<html_component_color>()) {
+               maybe_fgcolor = css::css_value::maybe_color_from_string(color_comp.value()->value);
+       }
 
-               if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
-                       tag->block = rspamd::css::parse_css_declaration(pool, param.value);
-               }
+       if (auto bgcolor_comp = tag->find_component<html_component_bgcolor>()) {
+               maybe_bgcolor = css::css_value::maybe_color_from_string(bgcolor_comp.value()->value);
+       }
 
-               if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN) {
-                       hidden = true;
-               }
+       // Process style component
+       if (auto style_value = tag->find_style()) {
+               tag->block = rspamd::css::parse_css_declaration(pool, *style_value);
        }
 
+       // Check if hidden
+       hidden = tag->is_hidden();
+
        if (!tag->block) {
                tag->block = html_block::undefined_html_block_pool(pool);
        }
@@ -1284,7 +1293,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
                }
                else if (tag->id == Tag_IMG) {
                        /* Process ALT if presented */
-                       auto maybe_alt = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_ALT);
+                       auto maybe_alt = tag->find_alt();
 
                        if (maybe_alt) {
                                if (!hc->parsed.empty() && !g_ascii_isspace(hc->parsed.back())) {
@@ -1384,9 +1393,7 @@ auto html_process_input(struct rspamd_task *task,
                overflow_input = true;
        }
 
-       auto new_tag = [&](int flags = 0) -> struct html_tag *
-       {
-
+       auto new_tag = [&](int flags = 0) -> struct html_tag * {
                if (hc->all_tags.size() > rspamd::html::max_tags) {
                        hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
 
@@ -2151,7 +2158,7 @@ auto html_process_input(struct rspamd_task *task,
        /* Leftover after content */
        switch (state) {
        case tags_limit_overflow:
-               html_append_parsed(hc, {c, (std::size_t)(end - c)},
+               html_append_parsed(hc, {c, (std::size_t) (end - c)},
                                                   false, end - start, hc->parsed);
                break;
        default:
@@ -2390,4 +2397,4 @@ gsize rspamd_html_get_tags_count(void *html_content)
        }
 
        return hc->all_tags.size();
-}
\ No newline at end of file
+}
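diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx
index 309d76177ebbbbb4cdf9283d48636cdc919e1814..a6b366a91372ccd8490365aae8c7e15f387420e4 100644
--- a/src/libserver/html/html_tag.hxx
+++ b/src/libserver/html/html_tag.hxx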
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2021 Vsevolod Stakhov
+/*
+ * Copyright 2025 Vsevolod Stakhov
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- *   http://www.apache.org/licenses/LICENSE-2.0
+ *    http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
@@ -26,6 +26,7 @@
 #include <cstdint>
 
 #include "html_tags.h"
+#include "libutil/str_util.h"
 
 struct rspamd_url;
 struct html_image;
@@ -34,6 +35,7 @@ namespace rspamd::html {
 
 struct html_content; /* Forward declaration */
 
+// Internal enum for mapping (not exposed in public API)
 enum class html_component_type : std::uint8_t {
        RSPAMD_HTML_COMPONENT_NAME = 0,
        RSPAMD_HTML_COMPONENT_HREF,
@@ -50,6 +52,269 @@ enum class html_component_type : std::uint8_t {
        RSPAMD_HTML_COMPONENT_HIDDEN,
 };
 
+// Forward declarations for component types
+struct html_component_name;
+struct html_component_href;
+struct html_component_color;
+struct html_component_bgcolor;
+struct html_component_style;
+struct html_component_class;
+struct html_component_width;
+struct html_component_height;
+struct html_component_size;
+struct html_component_rel;
+struct html_component_alt;
+struct html_component_id;
+struct html_component_hidden;
+struct html_component_unknown;
+
+// Base interface for all components
+struct html_component_base {
+       virtual ~html_component_base() = default;
+       virtual std::string_view get_string_value() const = 0;
+};
+
+// String-based components
+struct html_component_name : html_component_base {
+       std::string_view value;
+       explicit html_component_name(std::string_view v)
+               : value(v)
+       {
+       }
+       std::string_view get_string_value() const override
+       {
+               return value;
+       }
+};
+
+struct html_component_href : html_component_base {
+       std::string_view value;
+       explicit html_component_href(std::string_view v)
+               : value(v)
+       {
+       }
+       std::string_view get_string_value() const override
+       {
+               return value;
+       }
+};
+
+struct html_component_style : html_component_base {
+       std::string_view value;
+       explicit html_component_style(std::string_view v)
+               : value(v)
+       {
+       }
+       std::string_view get_string_value() const override
+       {
+               return value;
+       }
+};
+
+struct html_component_class : html_component_base {
+       std::string_view value;
+       explicit html_component_class(std::string_view v)
+               : value(v)
+       {
+       }
+       std::string_view get_string_value() const override
+       {
+               return value;
+       }
+};
+
+struct html_component_rel : html_component_base {
+       std::string_view value;
+       explicit html_component_rel(std::string_view v)
+               : value(v)
+       {
+       }
+       std::string_view get_string_value() const override
+       {
+               return value;
+       }
+};
+
+struct html_component_alt : html_component_base {
+       std::string_view value;
+       explicit html_component_alt(std::string_view v)
+               : value(v)
+       {
+       }
+       std::string_view get_string_value() const override
+       {
+               return value;
+       }
+};
+
+struct html_component_id : html_component_base {
+       std::string_view value;
+       explicit html_component_id(std::string_view v)
+               : value(v)
+       {
+       }
+       std::string_view get_string_value() const override
+       {
+               return value;
+       }
+};
+
+// Color components (could be extended to parse actual colors)
+struct html_component_color : html_component_base {
+       std::string_view value;
+       explicit html_component_color(std::string_view v)
+               : value(v)
+       {
+       }
+       std::string_view get_string_value() const override
+       {
+               return value;
+       }
+};
+
+struct html_component_bgcolor : html_component_base {
+       std::string_view value;
+       explicit html_component_bgcolor(std::string_view v)
+               : value(v)
+       {
+       }
+       std::string_view get_string_value() const override
+       {
+               return value;
+       }
+};
+
+// Numeric components
+struct html_component_width : html_component_base {
+       std::string_view raw_value;
+       std::optional<std::uint32_t> numeric_value;
+       // Width attribute: numeric_value is set only if rspamd_strtoul accepts `v` and the result fits in 32 bits
+       explicit html_component_width(const std::string_view v)
+               : raw_value(v)
+       {
+               unsigned long val = 0;
+               if (rspamd_strtoul(v.data(), v.size(), &val) && val <= UINT32_MAX) {
+                       numeric_value = static_cast<std::uint32_t>(val);
+               }
+       }
+       // Attribute text exactly as stored, whether or not it parsed as a number
+       std::string_view get_string_value() const override
+       {
+               return raw_value;
+       }
+       std::optional<std::uint32_t> get_numeric_value() const
+       {
+               return numeric_value;
+       }
+};
+
+struct html_component_height : html_component_base {
+       std::string_view raw_value;
+       std::optional<std::uint32_t> numeric_value;
+       // Height attribute: numeric_value is set only if rspamd_strtoul accepts `v` and the result fits in 32 bits
+       explicit html_component_height(const std::string_view v)
+               : raw_value(v)
+       {
+               unsigned long val = 0;
+               if (rspamd_strtoul(v.data(), v.size(), &val) && val <= UINT32_MAX) {
+                       numeric_value = static_cast<std::uint32_t>(val);
+               }
+       }
+       // Attribute text exactly as stored, whether or not it parsed as a number
+       std::string_view get_string_value() const override
+       {
+               return raw_value;
+       }
+       std::optional<std::uint32_t> get_numeric_value() const
+       {
+               return numeric_value;
+       }
+};
+
+struct html_component_size : html_component_base {
+       std::string_view raw_value;
+       std::optional<std::uint32_t> numeric_value;
+       // Size attribute: numeric_value is set only if rspamd_strtoul accepts `v` and the result fits in 32 bits
+       explicit html_component_size(std::string_view v)
+               : raw_value(v)
+       {
+               unsigned long val = 0;
+               if (rspamd_strtoul(v.data(), v.size(), &val) && val <= UINT32_MAX) {
+                       numeric_value = static_cast<std::uint32_t>(val);
+               }
+       }
+       // Attribute text exactly as stored, whether or not it parsed as a number
+       std::string_view get_string_value() const override
+       {
+               return raw_value;
+       }
+       std::optional<std::uint32_t> get_numeric_value() const
+       {
+               return numeric_value;
+       }
+};
+
+// Boolean/flag component
+struct html_component_hidden : html_component_base {
+       bool present;
+       explicit html_component_hidden()
+               : present(true)
+       {
+       }
+       std::string_view get_string_value() const override
+       {
+               return present ? "true" : "false";
+       }
+       bool is_present() const
+       {
+               return present;
+       }
+};
+
+// Unknown component with both name and value
+struct html_component_unknown : html_component_base {
+       std::string_view name;
+       std::string_view value;
+
+       html_component_unknown(std::string_view n, std::string_view v)
+               : name(n), value(v)
+       {
+       }
+       std::string_view get_string_value() const override
+       {
+               return value;
+       }
+       std::string_view get_name() const
+       {
+               return name;
+       }
+};
+
+// The variant type that holds all possible components
+using html_tag_component = std::variant<
+       html_component_name,
+       html_component_href,
+       html_component_color,
+       html_component_bgcolor,
+       html_component_style,
+       html_component_class,
+       html_component_width,
+       html_component_height,
+       html_component_size,
+       html_component_rel,
+       html_component_alt,
+       html_component_id,
+       html_component_hidden,
+       html_component_unknown>;
+
+/**
+ * Returns component variant from a string
+ * @param name attribute name
+ * @param value attribute value
+ * @return variant component
+ */
+auto html_component_from_string(std::string_view name, std::string_view value) -> html_tag_component;
+
 /* Public tags flags */
 /* XML tag */
 #define FL_XML (1u << CM_USER_SHIFT)
@@ -62,23 +327,7 @@ enum class html_component_type : std::uint8_t {
 #define FL_COMMENT (1 << (CM_USER_SHIFT + 6))
 #define FL_VIRTUAL (1 << (CM_USER_SHIFT + 7))
 
-/**
- * Returns component type from a string
- * @param st
- * @return
- */
-auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>;
-
 using html_tag_extra_t = std::variant<std::monostate, struct rspamd_url *, struct html_image *>;
-struct html_tag_component {
-       html_component_type type;
-       std::string_view value;
-
-       html_tag_component(html_component_type type, std::string_view value)
-               : type(type), value(value)
-       {
-       }
-};
 
 /* Pairing closing tag representation */
 struct html_closing_tag {
@@ -105,26 +354,174 @@ struct html_tag {
        std::vector<struct html_tag *> children;
        struct html_tag *parent;
 
-       auto find_component(html_component_type what) const -> std::optional<std::string_view>
+       // Template method to find component by type
+       template<typename T>
+       auto find_component() const -> std::optional<const T *>
        {
                for (const auto &comp: components) {
-                       if (comp.type == what) {
-                               return comp.value;
+                       if (std::holds_alternative<T>(comp)) {
+                               return &std::get<T>(comp);
                        }
                }
+               return std::nullopt;
+       }
 
+       // Helper methods for common component access
+       auto find_href() const -> std::optional<std::string_view>
+       {
+               if (auto comp = find_component<html_component_href>()) {
+                       return comp.value()->value;
+               }
                return std::nullopt;
        }
 
-       auto find_component(std::optional<html_component_type> what) const -> std::optional<std::string_view>
+       auto find_class() const -> std::optional<std::string_view>
        {
-               if (what) {
-                       return find_component(what.value());
+               if (auto comp = find_component<html_component_class>()) {
+                       return comp.value()->value;
                }
+               return std::nullopt;
+       }
+
+       auto find_id() const -> std::optional<std::string_view>
+       {
+               if (auto comp = find_component<html_component_id>()) {
+                       return comp.value()->value;
+               }
+               return std::nullopt;
+       }
 
+       auto find_width() const -> std::optional<std::uint32_t>
+       {
+               if (auto comp = find_component<html_component_width>()) {
+                       return comp.value()->get_numeric_value();
+               }
+               return std::nullopt;
+       }
+
+       auto find_height() const -> std::optional<std::uint32_t>
+       {
+               if (auto comp = find_component<html_component_height>()) {
+                       return comp.value()->get_numeric_value();
+               }
+               return std::nullopt;
+       }
+
+       auto find_style() const -> std::optional<std::string_view>
+       {
+               if (auto comp = find_component<html_component_style>()) {
+                       return comp.value()->value;
+               }
                return std::nullopt;
        }
 
+       auto find_alt() const -> std::optional<std::string_view>
+       {
+               if (auto comp = find_component<html_component_alt>()) {
+                       return comp.value()->value;
+               }
+               return std::nullopt;
+       }
+
+       auto find_rel() const -> std::optional<std::string_view>
+       {
+               if (auto comp = find_component<html_component_rel>()) {
+                       return comp.value()->value;
+               }
+               return std::nullopt;
+       }
+
+       auto is_hidden() const -> bool
+       {
+               return find_component<html_component_hidden>().has_value();
+       }
+
+       auto find_unknown_component(std::string_view attr_name) const -> std::optional<std::string_view>
+       {
+               // Linear scan: the first unknown component carrying this name wins
+               for (const auto &entry: components) {
+                       const auto *candidate = std::get_if<html_component_unknown>(&entry);
+                       if (candidate != nullptr && candidate->name == attr_name) {
+                               return candidate->value;
+                       }
+               }
+               // Nothing with that name was stored as an unknown component
+               return std::nullopt;
+       }
+
+       auto get_unknown_components() const -> std::vector<std::pair<std::string_view, std::string_view>>
+       {
+               // Collect (name, value) pairs of every unknown component, in storage order
+               std::vector<std::pair<std::string_view, std::string_view>> collected;
+               for (const auto &entry: components) {
+                       if (const auto *extra = std::get_if<html_component_unknown>(&entry)) {
+                               collected.emplace_back(extra->name, extra->value);
+                       }
+               }
+               return collected;
+       }
+
+       // Visit every component in storage order; the visitor is passed as an lvalue so it is never re-forwarded (moved from) between iterations
+       template<typename Visitor>
+       auto visit_components(Visitor &&visitor) const
+       {
+               for (const auto &comp: components) {
+                       std::visit(visitor, comp);
+               }
+       }
+
+       // Look up an attribute's text by its name (used by the Lua binding); std::nullopt when nothing matches
+       auto find_component_by_name(std::string_view attr_name) const -> std::optional<std::string_view>
+       {
+               // Known components first, via their typed helpers; `hidden` yields the literal "true" when present
+               if (attr_name == "href") return find_href();
+               if (attr_name == "class") return find_class();
+               if (attr_name == "id") return find_id();
+               if (attr_name == "style") return find_style();
+               if (attr_name == "alt") return find_alt();
+               if (attr_name == "rel") return find_rel();
+               if (attr_name == "hidden") return is_hidden() ? std::optional<std::string_view>{"true"} : std::nullopt;
+
+               // Numeric components: return the raw attribute text, not the parsed number
+               if (attr_name == "width") {
+                       if (auto comp = find_component<html_component_width>()) {
+                               return comp.value()->get_string_value();
+                       }
+               }
+               if (attr_name == "height") {
+                       if (auto comp = find_component<html_component_height>()) {
+                               return comp.value()->get_string_value();
+                       }
+               }
+               if (attr_name == "size") {
+                       if (auto comp = find_component<html_component_size>()) {
+                               return comp.value()->get_string_value();
+                       }
+               }
+
+               // Color components are returned as their stored text
+               if (attr_name == "color") {
+                       if (auto comp = find_component<html_component_color>()) {
+                               return comp.value()->value;
+                       }
+               }
+               if (attr_name == "bgcolor") {
+                       if (auto comp = find_component<html_component_bgcolor>()) {
+                               return comp.value()->value;
+                       }
+               }
+
+               // The `name` attribute has its own component type
+               if (attr_name == "name") {
+                       if (auto comp = find_component<html_component_name>()) {
+                               return comp.value()->value;
+                       }
+               }
+
+               // Fall back to unknown components; the width..name branches above also land here when no typed component exists
+               return find_unknown_component(attr_name);
+       }
+
        auto clear(void) -> void
        {
                id = Tag_UNKNOWN;
index 090e2af55ce3464f95a461dd2bfefea061dc3c8c..a03247c8ade121434e7523342606ef332f7ef428 100644 (file)
@@ -712,8 +712,7 @@ lua_html_tag_get_attribute(lua_State *L)
        const char *attr_name = luaL_checklstring(L, 2, &slen);
 
        if (ltag && attr_name) {
-               auto maybe_attr = ltag->tag->find_component(
-                       rspamd::html::html_component_from_string({attr_name, slen}));
+               auto maybe_attr = ltag->tag->find_component_by_name({attr_name, slen});
 
                if (maybe_attr) {
                        lua_pushlstring(L, maybe_attr->data(), maybe_attr->size());