git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Add HTML URL rewriting infrastructure
author Vsevolod Stakhov <vsevolod@rspamd.com>
Sat, 11 Oct 2025 09:03:37 +0000 (10:03 +0100)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Sat, 11 Oct 2025 09:03:37 +0000 (10:03 +0100)
Implements infrastructure for rewriting clickable URLs in HTML content:

- Add span tracking to HTML parser to capture byte offsets of href/src attribute values
- Implement patch-based URL rewriting engine with overlap validation
- Add C→Lua glue for URL rewriting callback functions
- Support MIME re-encoding (quoted-printable, base64, 8bit) for modified content
- Add configuration options: enable_url_rewrite, url_rewrite_lua_func, url_rewrite_fold_limit

The feature allows Lua callbacks to transform URLs while preserving HTML structure
and MIME encoding. Integration with milter REPLBODY support enables message body
replacement.

src/libserver/CMakeLists.txt
src/libserver/cfg_file.h
src/libserver/cfg_rcl.cxx
src/libserver/cfg_utils.cxx
src/libserver/html/html.cxx
src/libserver/html/html.hxx
src/libserver/html/html_tag.hxx
src/libserver/html/html_url_rewrite.cxx [new file with mode: 0644]
src/libserver/html/html_url_rewrite.hxx [new file with mode: 0644]

index d3415bdb299bb54e5cf55100f18b2073302af218..73b04856d7a1223d66609d14f988ba6439d99f64 100644 (file)
@@ -42,6 +42,7 @@ SET(LIBRSPAMDSERVERSRC
         ${CMAKE_CURRENT_SOURCE_DIR}/html/html_entities.cxx
         ${CMAKE_CURRENT_SOURCE_DIR}/html/html_url.cxx
         ${CMAKE_CURRENT_SOURCE_DIR}/html/html.cxx
+        ${CMAKE_CURRENT_SOURCE_DIR}/html/html_url_rewrite.cxx
         ${CMAKE_CURRENT_SOURCE_DIR}/html/html_tests.cxx
         ${CMAKE_CURRENT_SOURCE_DIR}/hyperscan_tools.cxx
         ${CMAKE_CURRENT_SOURCE_DIR}/backtrace.cxx
index 32168c754c937a1bef4e28d8d88d8870b0c1a254..c16db6fe98a7b3eeccf6f51f2d75ad4b07e944f9 100644 (file)
@@ -347,6 +347,7 @@ struct rspamd_config {
        char *pid_file;                  /**< name of pid file                                                                  */
        char *temp_dir;                  /**< dir for temp files                                                                        */
        char *control_socket_path;       /**< path to the control socket                                                        */
+       char *url_rewrite_lua_func;      /**< Lua function for URL rewriting                                            */
        const ucl_object_t *local_addrs; /**< tree of local addresses                                                   */
 #ifdef WITH_GPERF_TOOLS
        char *profile_path;
@@ -375,6 +376,7 @@ struct rspamd_config {
        enum rspamd_gtube_patterns_policy gtube_patterns_policy; /**< Enable test patterns                                                              */
        gboolean enable_css_parser;                              /**< Enable css parsing in HTML                                                        */
        gboolean enable_mime_utf;                                /**< Enable utf8 mime parsing                                                  */
+       gboolean enable_url_rewrite;                             /**< Enable HTML URL rewriting                                                 */
 
        gsize max_cores_size;        /**< maximum size occupied by rspamd core files                    */
        gsize max_cores_count;       /**< maximum number of core files                                          */
@@ -384,6 +386,7 @@ struct rspamd_config {
        gsize images_cache_size;     /**< size of LRU cache for DCT data from images                    */
        double task_timeout;         /**< maximum message processing time                                       */
        int default_max_shots;       /**< default maximum count of symbols hits permitted (-1 for unlimited) */
+       int url_rewrite_fold_limit;  /**< line fold limit for URL rewrite MIME encoding (default 76) */
        int32_t heartbeats_loss_max; /**< number of heartbeats lost to consider worker's termination */
        double heartbeat_interval;   /**< interval for heartbeats for workers                           */
 
index 2d8d396eec1bf39a69765e7916848203f121ac23..91503ccb4fe3086fa1f8c52ddedb8c4a06f499d5 100644 (file)
@@ -2111,6 +2111,24 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections)
                                                                           G_STRUCT_OFFSET(struct rspamd_config, enable_mime_utf),
                                                                           0,
                                                                           "Enable UTF8 mode for mime");
+               rspamd_rcl_add_default_handler(sub,
+                                                                          "enable_url_rewrite",
+                                                                          rspamd_rcl_parse_struct_boolean,
+                                                                          G_STRUCT_OFFSET(struct rspamd_config, enable_url_rewrite),
+                                                                          0,
+                                                                          "Enable HTML URL rewriting");
+               rspamd_rcl_add_default_handler(sub,
+                                                                          "url_rewrite_lua_func",
+                                                                          rspamd_rcl_parse_struct_string,
+                                                                          G_STRUCT_OFFSET(struct rspamd_config, url_rewrite_lua_func),
+                                                                          0,
+                                                                          "Lua function name for URL rewriting callback");
+               rspamd_rcl_add_default_handler(sub,
+                                                                          "url_rewrite_fold_limit",
+                                                                          rspamd_rcl_parse_struct_integer,
+                                                                          G_STRUCT_OFFSET(struct rspamd_config, url_rewrite_fold_limit),
+                                                                          0,
+                                                                          "Line fold limit for MIME re-encoding (default: 76)");
                rspamd_rcl_add_default_handler(sub,
                                                                           "enable_experimental",
                                                                           rspamd_rcl_parse_struct_boolean,
index 1e96c320af4343d9716b047a9eaaedb708beb7ae..87011432a278e75983da7f612c025de57e7196b5 100644 (file)
@@ -348,6 +348,9 @@ rspamd_config_new(enum rspamd_config_init_flags flags)
 
        cfg->enable_css_parser = true;
        cfg->enable_mime_utf = false;
+       cfg->enable_url_rewrite = false;
+       cfg->url_rewrite_lua_func = nullptr;
+       cfg->url_rewrite_fold_limit = 76;
        cfg->script_modules = g_ptr_array_new();
 
        REF_INIT_RETAIN(cfg, rspamd_config_free);
index 4b1867f6588c9ae61f33fe60868b29730179e8cb..8a1439fb43d8dcc77a6a0e35f9c4489a4ad97b8b 100644 (file)
@@ -265,7 +265,7 @@ html_check_balance(struct html_content *hc,
        return nullptr;
 }
 
-auto html_component_from_string(std::string_view name, std::string_view value) -> html_tag_component
+auto html_component_from_string(std::string_view name, std::string_view value, std::size_t offset, std::size_t len) -> html_tag_component
 {
        auto known_component_it = html_components_map.find(name);
 
@@ -274,7 +274,7 @@ auto html_component_from_string(std::string_view name, std::string_view value) -
                case html_component_enum_type::RSPAMD_HTML_COMPONENT_NAME:
                        return html_component_name{value};
                case html_component_enum_type::RSPAMD_HTML_COMPONENT_HREF:
-                       return html_component_href{value};
+                       return html_component_href{value, offset, len};
                case html_component_enum_type::RSPAMD_HTML_COMPONENT_COLOR:
                        return html_component_color{value};
                case html_component_enum_type::RSPAMD_HTML_COMPONENT_BGCOLOR:
@@ -387,7 +387,7 @@ auto html_component_from_string(std::string_view name, std::string_view value) -
                case html_component_enum_type::RSPAMD_HTML_COMPONENT_TITLE:
                        return html_component_title{value};
                case html_component_enum_type::RSPAMD_HTML_COMPONENT_SRC:
-                       return html_component_src{value};
+                       return html_component_src{value, offset, len};
                // Meta
                case html_component_enum_type::RSPAMD_HTML_COMPONENT_CHARSET:
                        return html_component_charset{value};
@@ -891,12 +891,15 @@ struct tag_content_parser_state {
        tag_parser_state cur_state = parse_start;
        std::string buf;
        std::string attr_name;// Store current attribute name
+       const char *value_start = nullptr;// Track where attribute value starts in input
+       const char *html_start = nullptr; // Base pointer to HTML buffer start
 
        void reset()
        {
                cur_state = parse_start;
                buf.clear();
                attr_name.clear();
+               value_start = nullptr;
        }
 };
 
@@ -924,6 +927,13 @@ html_parse_tag_content(rspamd_mempool_t *pool,
                                attr_name_view = {name_storage, parser_env.attr_name.size()};
                        }
 
+                       // Calculate attribute value span for URL rewriting (href/src only)
+                       std::size_t value_offset = 0, value_len = 0;
+                       if (parser_env.value_start != nullptr && parser_env.html_start != nullptr) {
+                               value_offset = parser_env.value_start - parser_env.html_start;
+                               value_len = in - parser_env.value_start;
+                       }
+
                        // Store value in persistent memory if not empty
                        if (!parser_env.buf.empty()) {
                                auto *value_storage = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size());
@@ -940,13 +950,14 @@ html_parse_tag_content(rspamd_mempool_t *pool,
                                value_view = {value_storage, sz};
                        }
 
-                       // Create the appropriate component variant
-                       auto component = html_component_from_string(attr_name_view, value_view);
+                       // Create the appropriate component variant with span info
+                       auto component = html_component_from_string(attr_name_view, value_view, value_offset, value_len);
                        tag->components.emplace_back(std::move(component));
                }
 
                parser_env.buf.clear();
                parser_env.attr_name.clear();
+               parser_env.value_start = nullptr;
        };
 
        auto store_component_name = [&]() -> bool {
@@ -1098,6 +1109,10 @@ html_parse_tag_content(rspamd_mempool_t *pool,
                        state = parse_start_squote;
                }
                else if (!g_ascii_isspace(*in)) {
+                       // Mark start of unquoted attribute value
+                       if (parser_env.value_start == nullptr) {
+                               parser_env.value_start = in;
+                       }
                        store_value_character(true);
                        state = parse_value;
                }
@@ -1114,6 +1129,10 @@ html_parse_tag_content(rspamd_mempool_t *pool,
                        state = parse_start_squote;
                }
                else {
+                       // Mark start of unquoted attribute value
+                       if (parser_env.value_start == nullptr) {
+                               parser_env.value_start = in;
+                       }
                        store_value_character(true);
                        state = parse_value;
                }
@@ -1125,6 +1144,10 @@ html_parse_tag_content(rspamd_mempool_t *pool,
                        state = spaces_after_param;
                }
                else {
+                       // Mark start of attribute value (first char inside quotes)
+                       if (parser_env.value_start == nullptr) {
+                               parser_env.value_start = in;
+                       }
                        store_value_character(false);
                        state = parse_dqvalue;
                }
@@ -1136,6 +1159,10 @@ html_parse_tag_content(rspamd_mempool_t *pool,
                        state = spaces_after_param;
                }
                else {
+                       // Mark start of attribute value (first char inside quotes)
+                       if (parser_env.value_start == nullptr) {
+                               parser_env.value_start = in;
+                       }
                        store_value_character(false);
                        state = parse_sqvalue;
                }
@@ -1171,6 +1198,10 @@ html_parse_tag_content(rspamd_mempool_t *pool,
                        state = spaces_after_param;
                }
                else {
+                       // Mark start of unquoted attribute value
+                       if (parser_env.value_start == nullptr) {
+                               parser_env.value_start = in;
+                       }
                        store_value_character(false);
                }
                break;
@@ -2475,6 +2506,7 @@ auto html_process_input(struct rspamd_task *task,
        c = p;
        end = p + process_size;
        start = c;
+       content_parser_env.html_start = start;// Initialize for span tracking
 
        while (p < end) {
                t = *p;
index 3e295ea60d7013ab417e958ab80accb9d0aba102..6f23f0fb953a4e1bb26b504f4b58503e8b11e4a7 100644 (file)
@@ -139,6 +139,35 @@ struct html_content {
                return true;
        }
 
+       /**
+        * Enumerate all clickable attributes (href, src) with their spans for URL rewriting
+        * @param callback function(tag, attr_name, span) -> bool (return false to stop iteration)
+        */
+       auto for_each_clickable_attr(fu2::function<bool(const html_tag *, std::string_view, const attr_span &)> &&callback) const -> void
+       {
+               for (const auto &tag: all_tags) {
+                       if (tag->flags & (FL_XML | FL_VIRTUAL | FL_BROKEN)) {
+                               continue;
+                       }
+
+                       // Check for tags with href or src attributes
+                       if (tag->flags & FL_HREF || tag->id == Tag_A || tag->id == Tag_IMG || tag->id == Tag_LINK || tag->id == Tag_BASE) {
+                               // Try href first
+                               if (auto span = tag->get_attr_span("href")) {
+                                       if (!callback(tag.get(), "href", span.value())) {
+                                               return;
+                                       }
+                               }
+                               // Then try src
+                               else if (auto span = tag->get_attr_span("src")) {
+                                       if (!callback(tag.get(), "src", span.value())) {
+                                               return;
+                                       }
+                               }
+                       }
+               }
+       }
+
 private:
        ~html_content() = default;
 };
index 0957cfc021a9802208461d30eba610768f06a7b0..3daa89edcfa6415f3caee5962cb5f5b617d214d3 100644 (file)
@@ -156,8 +156,10 @@ struct html_component_name : html_component_base {
 
 struct html_component_href : html_component_base {
        std::string_view value;
-       explicit constexpr html_component_href(std::string_view v)
-               : value(v)
+       std::size_t offset = 0;// offset in decoded HTML buffer
+       std::size_t len = 0;   // length of raw attribute value
+       explicit constexpr html_component_href(std::string_view v, std::size_t off = 0, std::size_t l = 0)
+               : value(v), offset(off), len(l)
        {
        }
        constexpr std::string_view get_string_value() const override
@@ -990,8 +992,10 @@ struct html_component_title : html_component_base {
 
 struct html_component_src : html_component_base {
        std::string_view value;
-       explicit html_component_src(std::string_view v)
-               : value(v)
+       std::size_t offset = 0;// offset in decoded HTML buffer
+       std::size_t len = 0;   // length of raw attribute value
+       explicit html_component_src(std::string_view v, std::size_t off = 0, std::size_t l = 0)
+               : value(v), offset(off), len(l)
        {
        }
        std::string_view get_string_value() const override
@@ -1259,9 +1263,11 @@ using html_tag_component = std::variant<
  * Returns component variant from a string
  * @param name attribute name
  * @param value attribute value
+ * @param offset offset of attribute value in decoded HTML buffer (for URL rewriting)
+ * @param len length of attribute value in decoded HTML buffer (for URL rewriting)
  * @return variant component
  */
-auto html_component_from_string(std::string_view name, std::string_view value) -> html_tag_component;
+auto html_component_from_string(std::string_view name, std::string_view value, std::size_t offset = 0, std::size_t len = 0) -> html_tag_component;
 
 /* Public tags flags */
 /* XML tag */
@@ -1288,6 +1294,12 @@ struct html_closing_tag {
        }
 };
 
+/*
+ * Attribute span in decoded HTML buffer (for URL rewriting).
+ * Describes where the raw value of an attribute sits inside the buffer
+ * that was handed to the HTML parser.
+ */
+struct attr_span {
+       std::size_t offset; /* distance from the buffer start to the first value character */
+       std::size_t len;    /* length of the raw attribute value inside that buffer */
+};
+
 struct html_tag {
        unsigned int tag_start = 0;
        unsigned int content_offset = 0;
@@ -1384,6 +1396,25 @@ struct html_tag {
                return find_component<html_component_hidden>().has_value();
        }
 
+       auto get_attr_span(std::string_view attr_name) const -> std::optional<attr_span>
+       {
+               if (attr_name == "href") {
+                       if (auto comp = find_component<html_component_href>()) {
+                               if (comp.value()->len > 0) {
+                                       return attr_span{comp.value()->offset, comp.value()->len};
+                               }
+                       }
+               }
+               else if (attr_name == "src") {
+                       if (auto comp = find_component<html_component_src>()) {
+                               if (comp.value()->len > 0) {
+                                       return attr_span{comp.value()->offset, comp.value()->len};
+                               }
+                       }
+               }
+               return std::nullopt;
+       }
+
        auto find_unknown_component(std::string_view attr_name) const -> std::optional<std::string_view>
        {
                for (const auto &comp: components) {
diff --git a/src/libserver/html/html_url_rewrite.cxx b/src/libserver/html/html_url_rewrite.cxx
new file mode 100644 (file)
index 0000000..b958cee
--- /dev/null
@@ -0,0 +1,338 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "lua/lua_common.h"
+#include "html_url_rewrite.hxx"
+#include "html.hxx"
+#include "html_tag.hxx"
+#include "libserver/task.h"
+#include "libserver/cfg_file.h"
+#include "libserver/url.h"
+#include "libmime/message.h"
+#include "libutil/str_util.h"
+
+#include <algorithm>
+
+/*
+ * Debug logging helper tagged "html_rewrite".
+ * The expansion dereferences a variable named `task`
+ * (task->task_pool->tag.uid), so the macro can only be used in a scope
+ * where such a variable is visible.
+ */
+#define msg_debug_html_rewrite(...) rspamd_conditional_debug_fast(NULL, NULL,                                                   \
+                                                                                                                                 rspamd_task_log_id, "html_rewrite", task->task_pool->tag.uid, \
+                                                                                                                                 __FUNCTION__,                                                 \
+                                                                                                                                 __VA_ARGS__)
+
+namespace rspamd::html {
+
+/**
+ * Call Lua url_rewriter function to get replacement URL
+ * @param task Rspamd task
+ * @param func_name Lua function name (e.g., "url_rewriter")
+ * @param url Original URL string
+ * @return Replacement URL or empty optional if no replacement
+ */
+static auto call_lua_url_rewriter(struct rspamd_task *task, const char *func_name, const std::string &url)
+       -> std::optional<std::string>
+{
+       if (!func_name || !task || !task->cfg) {
+               return std::nullopt;
+       }
+
+       auto *L = RSPAMD_LUA_CFG_STATE(task->cfg);
+       if (!L) {
+               return std::nullopt;
+       }
+
+       // Push error handler
+       lua_pushcfunction(L, &rspamd_lua_traceback);
+       auto err_idx = lua_gettop(L);
+
+       // Get the function
+       if (!rspamd_lua_require_function(L, func_name, nullptr)) {
+               msg_debug_html_rewrite("cannot require function %s", func_name);
+               lua_settop(L, err_idx - 1);
+               return std::nullopt;
+       }
+
+       // Push task
+       struct rspamd_task **ptask = (struct rspamd_task **) lua_newuserdata(L, sizeof(struct rspamd_task *));
+       *ptask = task;
+       rspamd_lua_setclass(L, rspamd_task_classname, -1);
+
+       // Push URL string
+       lua_pushlstring(L, url.c_str(), url.size());
+
+       // Call function with 2 args, 1 result
+       if (lua_pcall(L, 2, 1, err_idx) != 0) {
+               msg_warn_task("call to %s failed: %s", func_name, lua_tostring(L, -1));
+               lua_settop(L, err_idx - 1);
+               return std::nullopt;
+       }
+
+       // Check return value
+       std::optional<std::string> result;
+       if (lua_type(L, -1) == LUA_TSTRING) {
+               std::size_t len;
+               const char *str = lua_tolstring(L, -1, &len);
+               if (str && len > 0) {
+                       result = std::string{str, len};
+                       msg_debug_html_rewrite("URL rewrite: %s -> %s", url.c_str(), result->c_str());
+               }
+       }
+       else if (!lua_isnil(L, -1)) {
+               msg_warn_task("%s returned non-string value", func_name);
+       }
+
+       lua_settop(L, err_idx - 1);
+       return result;
+}
+
+auto enumerate_rewrite_candidates(const html_content *hc, struct rspamd_task *task, int part_id)
+       -> std::vector<rewrite_candidate>
+{
+       std::vector<rewrite_candidate> candidates;
+
+       if (!hc) {
+               return candidates;
+       }
+
+       // Enumerate all clickable attributes with spans
+       hc->for_each_clickable_attr([&](const html_tag *tag, std::string_view attr_name, const attr_span &span) -> bool {
+               // Get the href or src value
+               std::string_view url_value;
+               if (attr_name == "href") {
+                       if (auto href = tag->find_href()) {
+                               url_value = href.value();
+                       }
+               }
+               else if (attr_name == "src") {
+                       if (auto src_comp = tag->find_component<html_component_src>()) {
+                               url_value = src_comp.value()->value;
+                       }
+               }
+
+               if (url_value.empty()) {
+                       return true;// Continue to next
+               }
+
+               // Skip data: and cid: schemes by default
+               if (url_value.size() >= 5) {
+                       if (url_value.substr(0, 5) == "data:" || url_value.substr(0, 4) == "cid:") {
+                               return true;// Continue to next
+                       }
+               }
+
+               // Build absolute URL (already done by parser, but we have it in url_value)
+               // For now, just use url_value as-is. In real implementation, this should
+               // handle base URL resolution if needed.
+               std::string absolute_url{url_value};
+
+               // Create candidate
+               candidates.push_back(rewrite_candidate{tag, attr_name, std::move(absolute_url), span.offset, span.len, part_id});
+
+               return true;// Continue to next
+       });
+
+       return candidates;
+}
+
+auto validate_patches(std::vector<rewrite_patch> &patches) -> bool
+{
+       if (patches.empty()) {
+               return true;
+       }
+
+       // Sort patches by part_id and offset
+       std::sort(patches.begin(), patches.end());
+
+       // Check for overlaps within same part
+       for (std::size_t i = 1; i < patches.size(); i++) {
+               const auto &prev = patches[i - 1];
+               const auto &curr = patches[i];
+
+               // If same part, check for overlap
+               if (prev.part_id == curr.part_id) {
+                       auto prev_end = prev.offset + prev.len;
+                       if (prev_end > curr.offset) {
+                               // Overlap detected
+                               return false;
+                       }
+               }
+       }
+
+       return true;
+}
+
+auto apply_patches(std::string_view original, const std::vector<rewrite_patch> &patches)
+       -> std::string
+{
+       if (patches.empty()) {
+               return std::string{original};
+       }
+
+       std::string result;
+       result.reserve(original.size() + 1024);// Reserve extra space for potential growth
+
+       std::size_t pos = 0;
+
+       for (const auto &patch: patches) {
+               // Copy everything from pos to patch.offset
+               if (patch.offset > pos) {
+                       result.append(original.substr(pos, patch.offset - pos));
+               }
+
+               // Apply the replacement
+               result.append(patch.replacement);
+
+               // Move position to after the patched region
+               pos = patch.offset + patch.len;
+       }
+
+       // Copy remaining content
+       if (pos < original.size()) {
+               result.append(original.substr(pos));
+       }
+
+       return result;
+}
+
+auto process_html_url_rewrite(struct rspamd_task *task,
+                                                         const html_content *hc,
+                                                         const char *func_name,
+                                                         int part_id,
+                                                         std::string_view original_html)
+       -> std::optional<std::string>
+{
+       if (!task || !hc || !func_name) {
+               return std::nullopt;
+       }
+
+       // Enumerate candidates
+       auto candidates = enumerate_rewrite_candidates(hc, task, part_id);
+       if (candidates.empty()) {
+               msg_debug_html_rewrite("no URL rewrite candidates found");
+               return std::nullopt;
+       }
+
+       msg_debug_html_rewrite("found %zu URL rewrite candidates", candidates.size());
+
+       // Build patches by calling Lua for each candidate
+       std::vector<rewrite_patch> patches;
+       patches.reserve(candidates.size());
+
+       for (const auto &candidate: candidates) {
+               // Call Lua callback
+               auto replacement = call_lua_url_rewriter(task, func_name, candidate.absolute_url);
+               if (!replacement) {
+                       continue;// Skip if Lua returned nil
+               }
+
+               // Create patch
+               patches.push_back(rewrite_patch{
+                       candidate.part_id,
+                       candidate.offset,
+                       candidate.len,
+                       std::move(replacement.value())});
+       }
+
+       if (patches.empty()) {
+               msg_debug_html_rewrite("no patches generated from Lua callbacks");
+               return std::nullopt;
+       }
+
+       // Validate and sort patches
+       if (!validate_patches(patches)) {
+               msg_warn_task("URL rewrite patches overlap, skipping rewrite");
+               return std::nullopt;
+       }
+
+       msg_debug_html_rewrite("applying %zu patches", patches.size());
+
+       // Apply patches
+       return apply_patches(original_html, patches);
+}
+
+auto reencode_html_content(std::string_view decoded_html,
+                                                  int cte_type,
+                                                  int fold_limit)
+       -> std::optional<std::string>
+{
+       if (decoded_html.empty()) {
+               return std::nullopt;
+       }
+
+       auto cte = static_cast<enum rspamd_cte>(cte_type);
+
+       switch (cte) {
+       case RSPAMD_CTE_7BIT:
+       case RSPAMD_CTE_8BIT:
+               // No encoding needed, return as-is
+               return std::string{decoded_html};
+
+       case RSPAMD_CTE_QP: {
+               // Encode using quoted-printable with CRLF line endings (MIME standard)
+               if (fold_limit > 0) {
+                       char *encoded = rspamd_encode_qp_fold(
+                               reinterpret_cast<const unsigned char *>(decoded_html.data()),
+                               decoded_html.size(),
+                               fold_limit,
+                               nullptr,
+                               RSPAMD_TASK_NEWLINES_CRLF);
+                       if (encoded) {
+                               std::string result{encoded};
+                               g_free(encoded);
+                               return result;
+                       }
+               }
+               return std::nullopt;
+       }
+
+       case RSPAMD_CTE_B64: {
+               // Encode using base64 with CRLF line endings (MIME standard)
+               char *encoded = nullptr;
+               if (fold_limit > 0) {
+                       encoded = rspamd_encode_base64_fold(
+                               reinterpret_cast<const unsigned char *>(decoded_html.data()),
+                               decoded_html.size(),
+                               fold_limit,
+                               nullptr,
+                               RSPAMD_TASK_NEWLINES_CRLF);
+               }
+               else {
+                       // No folding
+                       encoded = rspamd_encode_base64(
+                               reinterpret_cast<const unsigned char *>(decoded_html.data()),
+                               decoded_html.size(),
+                               -1,
+                               nullptr);
+               }
+
+               if (encoded) {
+                       std::string result{encoded};
+                       g_free(encoded);
+                       return result;
+               }
+               return std::nullopt;
+       }
+
+       case RSPAMD_CTE_UUE:
+               // UUE encoding not supported for rewriting
+               return std::nullopt;
+
+       case RSPAMD_CTE_UNKNOWN:
+       default:
+               // Unknown encoding, return decoded content
+               return std::string{decoded_html};
+       }
+}
+
+}// namespace rspamd::html
diff --git a/src/libserver/html/html_url_rewrite.hxx b/src/libserver/html/html_url_rewrite.hxx
new file mode 100644 (file)
index 0000000..ed6ffce
--- /dev/null
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_HTML_URL_REWRITE_HXX
+#define RSPAMD_HTML_URL_REWRITE_HXX
+#pragma once
+
+#include <cstddef>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <vector>
+
+struct rspamd_task;
+
+namespace rspamd::html {
+
+struct html_content;
+struct html_tag;
+
+/**
+ * Candidate for URL rewriting
+ * Represents a single href/src attribute that may be rewritten, together with
+ * the location of its value inside the decoded HTML buffer.
+ *
+ * Pointer and scalar members have default initializers so that a
+ * default-constructed candidate never carries indeterminate values.
+ * The type stays an aggregate with the same member order, so positional
+ * brace initialization keeps working.
+ */
+struct rewrite_candidate {
+       const html_tag *tag = nullptr;// Tag containing the attribute (not owned by this struct)
+       std::string_view attr_name;   // "href" or "src"
+       std::string absolute_url;     // Absolute/canonicalized URL for Lua policy
+       std::size_t offset = 0;       // Offset of attribute value in decoded HTML buffer
+       std::size_t len = 0;          // Length of attribute value in decoded HTML buffer
+       int part_id = 0;              // MIME part ID (for multi-part messages)
+};
+
+/**
+ * Patch to apply to the decoded HTML buffer
+ * Represents a single replacement operation: `len` bytes starting at `offset`
+ * are to be replaced with `replacement`.
+ *
+ * Scalar members have default initializers so that a default-constructed
+ * patch is well defined (all zeroes, empty replacement). The type stays an
+ * aggregate with the same member order.
+ */
+struct rewrite_patch {
+       int part_id = 0;        // MIME part ID
+       std::size_t offset = 0; // Offset in decoded buffer
+       std::size_t len = 0;    // Length to replace
+       std::string replacement;// Replacement string
+
+       /**
+        * Strict ordering used for sorting patches: first by MIME part ID,
+        * then by offset within the part. `len` and `replacement` do not
+        * take part in the comparison.
+        */
+       bool operator<(const rewrite_patch &other) const noexcept
+       {
+               if (part_id != other.part_id) {
+                       return part_id < other.part_id;
+               }
+               return offset < other.offset;
+       }
+};
+
+/**
+ * Enumerate rewrite candidates from parsed HTML content
+ * @param hc HTML content structure (parsed HTML to take candidates from)
+ * @param task Rspamd task
+ * @param part_id MIME part ID
+ * @return vector of rewrite candidates; each rewrite_candidate describes a
+ *         single href/src attribute value that may be rewritten
+ */
+auto enumerate_rewrite_candidates(const html_content *hc, struct rspamd_task *task, int part_id)
+       -> std::vector<rewrite_candidate>;
+
+/**
+ * Validate and sort patches to ensure no overlaps
+ * @param patches vector of patches to validate; passed by non-const
+ *        reference because the patches are sorted as part of validation
+ * @return true if valid (no overlaps), false otherwise
+ */
+auto validate_patches(std::vector<rewrite_patch> &patches) -> bool;
+
+/**
+ * Apply patches to a decoded HTML buffer
+ * @param original original decoded buffer (not modified; the result is a new string)
+ * @param patches sorted, non-overlapping patches
+ *        (NOTE(review): presumably those accepted by validate_patches - confirm)
+ * @return rewritten buffer
+ */
+auto apply_patches(std::string_view original, const std::vector<rewrite_patch> &patches)
+       -> std::string;
+
+/**
+ * Process HTML URL rewriting for a task
+ * Enumerates candidates, calls Lua callback, applies patches, and returns rewritten HTML
+ * @param task Rspamd task
+ * @param hc HTML content
+ * @param func_name Lua function name for URL rewriting
+ * @param part_id MIME part ID
+ * @param original_html Original HTML content (decoded)
+ * @return Rewritten HTML, or std::nullopt if no changes were made
+ */
+auto process_html_url_rewrite(struct rspamd_task *task,
+                                                         const html_content *hc,
+                                                         const char *func_name,
+                                                         int part_id,
+                                                         std::string_view original_html)
+       -> std::optional<std::string>;
+
+/**
+ * Re-encode HTML content using MIME transfer encoding
+ * @param decoded_html Decoded HTML content (after URL rewriting)
+ * @param cte_type Content Transfer Encoding type (from rspamd_mime_part);
+ *        matched against the RSPAMD_CTE_* values
+ * @param fold_limit Line length limit for quoted-printable and base64
+ *        (0 = no folding; for base64 any value <= 0 disables folding)
+ * @return Encoded content, or std::nullopt on error. Uuencode
+ *         (RSPAMD_CTE_UUE) is not supported and yields std::nullopt;
+ *         for an unknown encoding the decoded content is returned unchanged.
+ */
+auto reencode_html_content(std::string_view decoded_html,
+                                                  int cte_type,
+                                                  int fold_limit = 76)
+       -> std::optional<std::string>;
+
+}// namespace rspamd::html
+
+#endif//RSPAMD_HTML_URL_REWRITE_HXX