From 6452583c7a76b4374ffcfad6e0fce057f5320447 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 11 Oct 2025 10:03:37 +0100 Subject: [PATCH] [Feature] Add HTML URL rewriting infrastructure MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Implements infrastructure for rewriting clickable URLs in HTML content: - Add span tracking to HTML parser to capture byte offsets of href/src attribute values - Implement patch-based URL rewriting engine with overlap validation - Add C→Lua glue for URL rewriting callback functions - Support MIME re-encoding (quoted-printable, base64, 8bit) for modified content - Add configuration options: enable_url_rewrite, url_rewrite_lua_func, url_rewrite_fold_limit The feature allows Lua callbacks to transform URLs while preserving HTML structure and MIME encoding. Integration with milter REPLBODY support enables message body replacement. --- src/libserver/CMakeLists.txt | 1 + src/libserver/cfg_file.h | 3 + src/libserver/cfg_rcl.cxx | 18 ++ src/libserver/cfg_utils.cxx | 3 + src/libserver/html/html.cxx | 42 ++- src/libserver/html/html.hxx | 29 ++ src/libserver/html/html_tag.hxx | 41 ++- src/libserver/html/html_url_rewrite.cxx | 338 ++++++++++++++++++++++++ src/libserver/html/html_url_rewrite.hxx | 123 +++++++++ 9 files changed, 588 insertions(+), 10 deletions(-) create mode 100644 src/libserver/html/html_url_rewrite.cxx create mode 100644 src/libserver/html/html_url_rewrite.hxx diff --git a/src/libserver/CMakeLists.txt b/src/libserver/CMakeLists.txt index d3415bdb29..73b04856d7 100644 --- a/src/libserver/CMakeLists.txt +++ b/src/libserver/CMakeLists.txt @@ -42,6 +42,7 @@ SET(LIBRSPAMDSERVERSRC ${CMAKE_CURRENT_SOURCE_DIR}/html/html_entities.cxx ${CMAKE_CURRENT_SOURCE_DIR}/html/html_url.cxx ${CMAKE_CURRENT_SOURCE_DIR}/html/html.cxx + ${CMAKE_CURRENT_SOURCE_DIR}/html/html_url_rewrite.cxx ${CMAKE_CURRENT_SOURCE_DIR}/html/html_tests.cxx ${CMAKE_CURRENT_SOURCE_DIR}/hyperscan_tools.cxx 
${CMAKE_CURRENT_SOURCE_DIR}/backtrace.cxx diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h index 32168c754c..c16db6fe98 100644 --- a/src/libserver/cfg_file.h +++ b/src/libserver/cfg_file.h @@ -347,6 +347,7 @@ struct rspamd_config { char *pid_file; /**< name of pid file */ char *temp_dir; /**< dir for temp files */ char *control_socket_path; /**< path to the control socket */ + char *url_rewrite_lua_func; /**< Lua function for URL rewriting */ const ucl_object_t *local_addrs; /**< tree of local addresses */ #ifdef WITH_GPERF_TOOLS char *profile_path; @@ -375,6 +376,7 @@ struct rspamd_config { enum rspamd_gtube_patterns_policy gtube_patterns_policy; /**< Enable test patterns */ gboolean enable_css_parser; /**< Enable css parsing in HTML */ gboolean enable_mime_utf; /**< Enable utf8 mime parsing */ + gboolean enable_url_rewrite; /**< Enable HTML URL rewriting */ gsize max_cores_size; /**< maximum size occupied by rspamd core files */ gsize max_cores_count; /**< maximum number of core files */ @@ -384,6 +386,7 @@ struct rspamd_config { gsize images_cache_size; /**< size of LRU cache for DCT data from images */ double task_timeout; /**< maximum message processing time */ int default_max_shots; /**< default maximum count of symbols hits permitted (-1 for unlimited) */ + int url_rewrite_fold_limit; /**< line fold limit for URL rewrite MIME encoding (default 76) */ int32_t heartbeats_loss_max; /**< number of heartbeats lost to consider worker's termination */ double heartbeat_interval; /**< interval for heartbeats for workers */ diff --git a/src/libserver/cfg_rcl.cxx b/src/libserver/cfg_rcl.cxx index 2d8d396eec..91503ccb4f 100644 --- a/src/libserver/cfg_rcl.cxx +++ b/src/libserver/cfg_rcl.cxx @@ -2111,6 +2111,24 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections) G_STRUCT_OFFSET(struct rspamd_config, enable_mime_utf), 0, "Enable UTF8 mode for mime"); + rspamd_rcl_add_default_handler(sub, + "enable_url_rewrite", + 
rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, enable_url_rewrite), + 0, + "Enable HTML URL rewriting"); + rspamd_rcl_add_default_handler(sub, + "url_rewrite_lua_func", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, url_rewrite_lua_func), + 0, + "Lua function name for URL rewriting callback"); + rspamd_rcl_add_default_handler(sub, + "url_rewrite_fold_limit", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, url_rewrite_fold_limit), + 0, + "Line fold limit for MIME re-encoding (default: 76)"); rspamd_rcl_add_default_handler(sub, "enable_experimental", rspamd_rcl_parse_struct_boolean, diff --git a/src/libserver/cfg_utils.cxx b/src/libserver/cfg_utils.cxx index 1e96c320af..87011432a2 100644 --- a/src/libserver/cfg_utils.cxx +++ b/src/libserver/cfg_utils.cxx @@ -348,6 +348,9 @@ rspamd_config_new(enum rspamd_config_init_flags flags) cfg->enable_css_parser = true; cfg->enable_mime_utf = false; + cfg->enable_url_rewrite = false; + cfg->url_rewrite_lua_func = nullptr; + cfg->url_rewrite_fold_limit = 76; cfg->script_modules = g_ptr_array_new(); REF_INIT_RETAIN(cfg, rspamd_config_free); diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 4b1867f658..8a1439fb43 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -265,7 +265,7 @@ html_check_balance(struct html_content *hc, return nullptr; } -auto html_component_from_string(std::string_view name, std::string_view value) -> html_tag_component +auto html_component_from_string(std::string_view name, std::string_view value, std::size_t offset, std::size_t len) -> html_tag_component { auto known_component_it = html_components_map.find(name); @@ -274,7 +274,7 @@ auto html_component_from_string(std::string_view name, std::string_view value) - case html_component_enum_type::RSPAMD_HTML_COMPONENT_NAME: return html_component_name{value}; case html_component_enum_type::RSPAMD_HTML_COMPONENT_HREF: - return 
html_component_href{value}; + return html_component_href{value, offset, len}; case html_component_enum_type::RSPAMD_HTML_COMPONENT_COLOR: return html_component_color{value}; case html_component_enum_type::RSPAMD_HTML_COMPONENT_BGCOLOR: @@ -387,7 +387,7 @@ auto html_component_from_string(std::string_view name, std::string_view value) - case html_component_enum_type::RSPAMD_HTML_COMPONENT_TITLE: return html_component_title{value}; case html_component_enum_type::RSPAMD_HTML_COMPONENT_SRC: - return html_component_src{value}; + return html_component_src{value, offset, len}; // Meta case html_component_enum_type::RSPAMD_HTML_COMPONENT_CHARSET: return html_component_charset{value}; @@ -891,12 +891,15 @@ struct tag_content_parser_state { tag_parser_state cur_state = parse_start; std::string buf; std::string attr_name;// Store current attribute name + const char *value_start = nullptr;// Track where attribute value starts in input + const char *html_start = nullptr; // Base pointer to HTML buffer start void reset() { cur_state = parse_start; buf.clear(); attr_name.clear(); + value_start = nullptr; } }; @@ -924,6 +927,13 @@ html_parse_tag_content(rspamd_mempool_t *pool, attr_name_view = {name_storage, parser_env.attr_name.size()}; } + // Calculate attribute value span for URL rewriting (href/src only) + std::size_t value_offset = 0, value_len = 0; + if (parser_env.value_start != nullptr && parser_env.html_start != nullptr) { + value_offset = parser_env.value_start - parser_env.html_start; + value_len = in - parser_env.value_start; + } + // Store value in persistent memory if not empty if (!parser_env.buf.empty()) { auto *value_storage = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size()); @@ -940,13 +950,14 @@ html_parse_tag_content(rspamd_mempool_t *pool, value_view = {value_storage, sz}; } - // Create the appropriate component variant - auto component = html_component_from_string(attr_name_view, value_view); + // Create the appropriate component variant with span info 
+ auto component = html_component_from_string(attr_name_view, value_view, value_offset, value_len); tag->components.emplace_back(std::move(component)); } parser_env.buf.clear(); parser_env.attr_name.clear(); + parser_env.value_start = nullptr; }; auto store_component_name = [&]() -> bool { @@ -1098,6 +1109,10 @@ html_parse_tag_content(rspamd_mempool_t *pool, state = parse_start_squote; } else if (!g_ascii_isspace(*in)) { + // Mark start of unquoted attribute value + if (parser_env.value_start == nullptr) { + parser_env.value_start = in; + } store_value_character(true); state = parse_value; } @@ -1114,6 +1129,10 @@ html_parse_tag_content(rspamd_mempool_t *pool, state = parse_start_squote; } else { + // Mark start of unquoted attribute value + if (parser_env.value_start == nullptr) { + parser_env.value_start = in; + } store_value_character(true); state = parse_value; } @@ -1125,6 +1144,10 @@ html_parse_tag_content(rspamd_mempool_t *pool, state = spaces_after_param; } else { + // Mark start of attribute value (first char inside quotes) + if (parser_env.value_start == nullptr) { + parser_env.value_start = in; + } store_value_character(false); state = parse_dqvalue; } @@ -1136,6 +1159,10 @@ html_parse_tag_content(rspamd_mempool_t *pool, state = spaces_after_param; } else { + // Mark start of attribute value (first char inside quotes) + if (parser_env.value_start == nullptr) { + parser_env.value_start = in; + } store_value_character(false); state = parse_sqvalue; } @@ -1171,6 +1198,10 @@ html_parse_tag_content(rspamd_mempool_t *pool, state = spaces_after_param; } else { + // Mark start of unquoted attribute value + if (parser_env.value_start == nullptr) { + parser_env.value_start = in; + } store_value_character(false); } break; @@ -2475,6 +2506,7 @@ auto html_process_input(struct rspamd_task *task, c = p; end = p + process_size; start = c; + content_parser_env.html_start = start;// Initialize for span tracking while (p < end) { t = *p; diff --git 
a/src/libserver/html/html.hxx b/src/libserver/html/html.hxx index 3e295ea60d..6f23f0fb95 100644 --- a/src/libserver/html/html.hxx +++ b/src/libserver/html/html.hxx @@ -139,6 +139,35 @@ struct html_content { return true; } + /** + * Enumerate clickable attributes with their spans for URL rewriting: at most one per tag, href if it has a span, otherwise src + * @param callback function(tag, attr_name, span) -> bool (return false to stop iteration) + */ + auto for_each_clickable_attr(fu2::function &&callback) const -> void + { + for (const auto &tag: all_tags) { + if (tag->flags & (FL_XML | FL_VIRTUAL | FL_BROKEN)) { + continue; + } + + // Check for tags with href or src attributes + if (tag->flags & FL_HREF || tag->id == Tag_A || tag->id == Tag_IMG || tag->id == Tag_LINK || tag->id == Tag_BASE) { + // Try href first + if (auto span = tag->get_attr_span("href")) { + if (!callback(tag.get(), "href", span.value())) { + return; + } + } + // Fall back to src only when the tag has no href span + else if (auto span = tag->get_attr_span("src")) { + if (!callback(tag.get(), "src", span.value())) { + return; + } + } + } + } + } + private: ~html_content() = default; }; diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx index 0957cfc021..3daa89edcf 100644 --- a/src/libserver/html/html_tag.hxx +++ b/src/libserver/html/html_tag.hxx @@ -156,8 +156,10 @@ struct html_component_name : html_component_base { struct html_component_href : html_component_base { std::string_view value; - explicit constexpr html_component_href(std::string_view v) - : value(v) + std::size_t offset = 0;// offset in decoded HTML buffer + std::size_t len = 0; // length of raw attribute value + explicit constexpr html_component_href(std::string_view v, std::size_t off = 0, std::size_t l = 0) + : value(v), offset(off), len(l) { } constexpr std::string_view get_string_value() const override @@ -990,8 +992,10 @@ struct html_component_title : html_component_base { struct html_component_src : html_component_base { std::string_view value; - explicit 
html_component_src(std::string_view v) - : value(v) + std::size_t offset = 0;// offset in decoded HTML buffer + std::size_t len = 0; // length of raw attribute value + explicit html_component_src(std::string_view v, std::size_t off = 0, std::size_t l = 0) + : value(v), offset(off), len(l) { } std::string_view get_string_value() const override @@ -1259,9 +1263,11 @@ using html_tag_component = std::variant< * Returns component variant from a string * @param name attribute name * @param value attribute value + * @param offset offset of attribute value in decoded HTML buffer (for URL rewriting) + * @param len length of attribute value in decoded HTML buffer (for URL rewriting) * @return variant component */ -auto html_component_from_string(std::string_view name, std::string_view value) -> html_tag_component; +auto html_component_from_string(std::string_view name, std::string_view value, std::size_t offset = 0, std::size_t len = 0) -> html_tag_component; /* Public tags flags */ /* XML tag */ @@ -1288,6 +1294,12 @@ struct html_closing_tag { } }; +/* Attribute span in decoded HTML buffer (for URL rewriting) */ +struct attr_span { + std::size_t offset; + std::size_t len; +}; + struct html_tag { unsigned int tag_start = 0; unsigned int content_offset = 0; @@ -1384,6 +1396,25 @@ struct html_tag { return find_component().has_value(); } + auto get_attr_span(std::string_view attr_name) const -> std::optional + { + if (attr_name == "href") { + if (auto comp = find_component()) { + if (comp.value()->len > 0) { + return attr_span{comp.value()->offset, comp.value()->len}; + } + } + } + else if (attr_name == "src") { + if (auto comp = find_component()) { + if (comp.value()->len > 0) { + return attr_span{comp.value()->offset, comp.value()->len}; + } + } + } + return std::nullopt; + } + auto find_unknown_component(std::string_view attr_name) const -> std::optional { for (const auto &comp: components) { diff --git a/src/libserver/html/html_url_rewrite.cxx 
b/src/libserver/html/html_url_rewrite.cxx new file mode 100644 index 0000000000..b958ceeeac --- /dev/null +++ b/src/libserver/html/html_url_rewrite.cxx @@ -0,0 +1,338 @@ +/* + * Copyright 2025 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "lua/lua_common.h" +#include "html_url_rewrite.hxx" +#include "html.hxx" +#include "html_tag.hxx" +#include "libserver/task.h" +#include "libserver/cfg_file.h" +#include "libserver/url.h" +#include "libmime/message.h" +#include "libutil/str_util.h" + +#include + +#define msg_debug_html_rewrite(...) 
rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_task_log_id, "html_rewrite", task->task_pool->tag.uid, \ + __FUNCTION__, \ + __VA_ARGS__) + +namespace rspamd::html { + +/** + * Call Lua url_rewriter function to get replacement URL + * @param task Rspamd task + * @param func_name Lua function name (e.g., "url_rewriter") + * @param url Original URL string + * @return Replacement URL or empty optional if no replacement + */ +static auto call_lua_url_rewriter(struct rspamd_task *task, const char *func_name, const std::string &url) + -> std::optional +{ + if (!func_name || !task || !task->cfg) { + return std::nullopt; + } + + auto *L = RSPAMD_LUA_CFG_STATE(task->cfg); + if (!L) { + return std::nullopt; + } + + // Push error handler + lua_pushcfunction(L, &rspamd_lua_traceback); + auto err_idx = lua_gettop(L); + + // Get the function + if (!rspamd_lua_require_function(L, func_name, nullptr)) { + msg_debug_html_rewrite("cannot require function %s", func_name); + lua_settop(L, err_idx - 1); + return std::nullopt; + } + + // Push task + struct rspamd_task **ptask = (struct rspamd_task **) lua_newuserdata(L, sizeof(struct rspamd_task *)); + *ptask = task; + rspamd_lua_setclass(L, rspamd_task_classname, -1); + + // Push URL string + lua_pushlstring(L, url.c_str(), url.size()); + + // Call function with 2 args, 1 result + if (lua_pcall(L, 2, 1, err_idx) != 0) { + msg_warn_task("call to %s failed: %s", func_name, lua_tostring(L, -1)); + lua_settop(L, err_idx - 1); + return std::nullopt; + } + + // Check return value + std::optional result; + if (lua_type(L, -1) == LUA_TSTRING) { + std::size_t len; + const char *str = lua_tolstring(L, -1, &len); + if (str && len > 0) { + result = std::string{str, len}; + msg_debug_html_rewrite("URL rewrite: %s -> %s", url.c_str(), result->c_str()); + } + } + else if (!lua_isnil(L, -1)) { + msg_warn_task("%s returned non-string value", func_name); + } + + lua_settop(L, err_idx - 1); + return result; +} + +auto 
enumerate_rewrite_candidates(const html_content *hc, struct rspamd_task *task, int part_id) + -> std::vector +{ + std::vector candidates; + + if (!hc) { + return candidates; + } + + // Enumerate all clickable attributes with spans + hc->for_each_clickable_attr([&](const html_tag *tag, std::string_view attr_name, const attr_span &span) -> bool { + // Get the href or src value + std::string_view url_value; + if (attr_name == "href") { + if (auto href = tag->find_href()) { + url_value = href.value(); + } + } + else if (attr_name == "src") { + if (auto src_comp = tag->find_component()) { + url_value = src_comp.value()->value; + } + } + + if (url_value.empty()) { + return true;// Continue to next + } + + // Skip data: and cid: schemes by default + if (url_value.size() >= 5) { + if (url_value.substr(0, 5) == "data:" || url_value.substr(0, 4) == "cid:") { + return true;// Continue to next + } + } + + // Build absolute URL (already done by parser, but we have it in url_value) + // For now, just use url_value as-is. In real implementation, this should + // handle base URL resolution if needed. 
+ std::string absolute_url{url_value}; + + // Create candidate + candidates.push_back(rewrite_candidate{tag, attr_name, std::move(absolute_url), span.offset, span.len, part_id}); + + return true;// Continue to next + }); + + return candidates; +} + +auto validate_patches(std::vector &patches) -> bool +{ + if (patches.empty()) { + return true; + } + + // Sort patches by part_id and offset + std::sort(patches.begin(), patches.end()); + + // Check for overlaps within same part + for (std::size_t i = 1; i < patches.size(); i++) { + const auto &prev = patches[i - 1]; + const auto &curr = patches[i]; + + // If same part, check for overlap + if (prev.part_id == curr.part_id) { + auto prev_end = prev.offset + prev.len; + if (prev_end > curr.offset) { + // Overlap detected + return false; + } + } + } + + return true; +} + +auto apply_patches(std::string_view original, const std::vector &patches) + -> std::string +{ + if (patches.empty()) { + return std::string{original}; + } + + std::string result; + result.reserve(original.size() + 1024);// Reserve extra space for potential growth + + std::size_t pos = 0; + + for (const auto &patch: patches) { + // Copy everything from pos to patch.offset + if (patch.offset > pos) { + result.append(original.substr(pos, patch.offset - pos)); + } + + // Apply the replacement + result.append(patch.replacement); + + // Move position to after the patched region + pos = patch.offset + patch.len; + } + + // Copy remaining content + if (pos < original.size()) { + result.append(original.substr(pos)); + } + + return result; +} + +auto process_html_url_rewrite(struct rspamd_task *task, + const html_content *hc, + const char *func_name, + int part_id, + std::string_view original_html) + -> std::optional +{ + if (!task || !hc || !func_name) { + return std::nullopt; + } + + // Enumerate candidates + auto candidates = enumerate_rewrite_candidates(hc, task, part_id); + if (candidates.empty()) { + msg_debug_html_rewrite("no URL rewrite candidates found"); 
+ return std::nullopt; + } + + msg_debug_html_rewrite("found %zu URL rewrite candidates", candidates.size()); + + // Build patches by calling Lua for each candidate + std::vector patches; + patches.reserve(candidates.size()); + + for (const auto &candidate: candidates) { + // Call Lua callback + auto replacement = call_lua_url_rewriter(task, func_name, candidate.absolute_url); + if (!replacement) { + continue;// Skip if Lua returned nil + } + + // Create patch + patches.push_back(rewrite_patch{ + candidate.part_id, + candidate.offset, + candidate.len, + std::move(replacement.value())}); + } + + if (patches.empty()) { + msg_debug_html_rewrite("no patches generated from Lua callbacks"); + return std::nullopt; + } + + // Validate and sort patches + if (!validate_patches(patches)) { + msg_warn_task("URL rewrite patches overlap, skipping rewrite"); + return std::nullopt; + } + + msg_debug_html_rewrite("applying %zu patches", patches.size()); + + // Apply patches + return apply_patches(original_html, patches); +} + +auto reencode_html_content(std::string_view decoded_html, + int cte_type, + int fold_limit) + -> std::optional +{ + if (decoded_html.empty()) { + return std::nullopt; + } + + auto cte = static_cast(cte_type); + + switch (cte) { + case RSPAMD_CTE_7BIT: + case RSPAMD_CTE_8BIT: + // No encoding needed, return as-is + return std::string{decoded_html}; + + case RSPAMD_CTE_QP: { + // Encode using quoted-printable with CRLF line endings (MIME standard) + if (fold_limit > 0) { + char *encoded = rspamd_encode_qp_fold( + reinterpret_cast(decoded_html.data()), + decoded_html.size(), + fold_limit, + nullptr, + RSPAMD_TASK_NEWLINES_CRLF); + if (encoded) { + std::string result{encoded}; + g_free(encoded); + return result; + } + } + return std::nullopt; + } + + case RSPAMD_CTE_B64: { + // Encode using base64 with CRLF line endings (MIME standard) + char *encoded = nullptr; + if (fold_limit > 0) { + encoded = rspamd_encode_base64_fold( + 
reinterpret_cast(decoded_html.data()), + decoded_html.size(), + fold_limit, + nullptr, + RSPAMD_TASK_NEWLINES_CRLF); + } + else { + // No folding + encoded = rspamd_encode_base64( + reinterpret_cast(decoded_html.data()), + decoded_html.size(), + -1, + nullptr); + } + + if (encoded) { + std::string result{encoded}; + g_free(encoded); + return result; + } + return std::nullopt; + } + + case RSPAMD_CTE_UUE: + // UUE encoding not supported for rewriting + return std::nullopt; + + case RSPAMD_CTE_UNKNOWN: + default: + // Unknown encoding, return decoded content + return std::string{decoded_html}; + } +} + +}// namespace rspamd::html diff --git a/src/libserver/html/html_url_rewrite.hxx b/src/libserver/html/html_url_rewrite.hxx new file mode 100644 index 0000000000..ed6ffce89a --- /dev/null +++ b/src/libserver/html/html_url_rewrite.hxx @@ -0,0 +1,123 @@ +/* + * Copyright 2025 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef RSPAMD_HTML_URL_REWRITE_HXX +#define RSPAMD_HTML_URL_REWRITE_HXX +#pragma once + +#include +#include +#include +#include + +struct rspamd_task; + +namespace rspamd::html { + +struct html_content; +struct html_tag; + +/** + * Candidate for URL rewriting + * Represents a single href/src attribute that may be rewritten + */ +struct rewrite_candidate { + const html_tag *tag; // Tag containing the attribute + std::string_view attr_name;// "href" or "src" + std::string absolute_url; // Absolute/canonicalized URL for Lua policy + std::size_t offset; // Offset of attribute value in decoded HTML buffer + std::size_t len; // Length of attribute value in decoded HTML buffer + int part_id; // MIME part ID (for multi-part messages) +}; + +/** + * Patch to apply to the decoded HTML buffer + * Represents a single replacement operation + */ +struct rewrite_patch { + int part_id; // MIME part ID + std::size_t offset; // Offset in decoded buffer + std::size_t len; // Length to replace + std::string replacement;// Replacement string + + // Orders patches by part_id first, then by offset within the same part + bool operator<(const rewrite_patch &other) const + { + if (part_id != other.part_id) { + return part_id < other.part_id; + } + return offset < other.offset; + } +}; + +/** + * Enumerate rewrite candidates from parsed HTML content + * @param hc HTML content structure + * @param task Rspamd task + * @param part_id MIME part ID + * @return vector of rewrite candidates + */ +auto enumerate_rewrite_candidates(const html_content *hc, struct rspamd_task *task, int part_id) + -> std::vector; + +/** + * Validate and sort patches to ensure no overlaps + * @param patches vector of patches to validate (sorted in place by part_id and offset) + * @return true if valid (no overlaps), false otherwise + */ +auto validate_patches(std::vector &patches) -> bool; + +/** + * Apply patches to a decoded HTML buffer + * @param original original decoded buffer + * @param patches sorted, non-overlapping patches + * @return rewritten buffer + */ +auto 
apply_patches(std::string_view original, const std::vector &patches) + -> std::string; + +/** + * Process HTML URL rewriting for a task + * Enumerates candidates, calls Lua callback, applies patches, and returns rewritten HTML + * @param task Rspamd task + * @param hc HTML content + * @param func_name Lua function name for URL rewriting + * @param part_id MIME part ID + * @param original_html Original HTML content (decoded) + * @return Rewritten HTML or nullopt if no changes + */ +auto process_html_url_rewrite(struct rspamd_task *task, + const html_content *hc, + const char *func_name, + int part_id, + std::string_view original_html) + -> std::optional; + +/** + * Re-encode HTML content using MIME transfer encoding + * @param decoded_html Decoded HTML content (after URL rewriting) + * @param cte_type Content Transfer Encoding type (from rspamd_mime_part) + * @param fold_limit Line length limit for quoted-printable and base64 (<= 0 = no folding for base64; quoted-printable requires a positive limit) + * @return Encoded content, or nullopt if the input is empty, the encoding is uuencode, or encoding fails + */ +auto reencode_html_content(std::string_view decoded_html, + int cte_type, + int fold_limit = 76) + -> std::optional; + +}// namespace rspamd::html + +#endif//RSPAMD_HTML_URL_REWRITE_HXX -- 2.47.3