From: Vsevolod Stakhov
Date: Wed, 25 Feb 2026 14:51:07 +0000 (+0000)
Subject: [Feature] Wire fasttext lang detector through maps infrastructure
X-Git-Tag: 4.0.0~65^2~3
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=da0b9ca773ee62e0372fd682e7b944ec71cc46c1;p=thirdparty%2Frspamd.git

[Feature] Wire fasttext lang detector through maps infrastructure

The fasttext language detector now supports HTTP/HTTPS URLs for model
loading via the maps system, enabling automatic download, disk caching,
periodic reload, and cross-worker mmap sharing.

Changes:
- fasttext_model::load() accepts an offset parameter for mmap at a
  non-zero position (used with page-aligned map cache files)
- fasttext_langdet uses rspamd_map_is_map() to detect URLs vs local
  paths; URLs go through rspamd_map_add() with RSPAMD_MAP_FILE_NO_READ
- Map callbacks (read/fin/dtor) handle atomic model swap on reload
- Local file paths continue to work as before with direct loading
---

diff --git a/src/libmime/lang_detection_fasttext.cxx b/src/libmime/lang_detection_fasttext.cxx index c7dbe1ebbc..0b67140e85 100644 --- a/src/libmime/lang_detection_fasttext.cxx +++ b/src/libmime/lang_detection_fasttext.cxx @@ -18,6 +18,8 @@ #include "fasttext_shim.h" #include "libserver/cfg_file.h" #include "libserver/logger.h" +#include "libserver/maps/map.h" +#include "libserver/maps/map_private.h" #include "contrib/fmt/include/fmt/base.h" #include "stat_api.h" #include "libserver/word.h" @@ -33,13 +35,140 @@ EXTERN_LOG_MODULE_DEF(langdet); __VA_ARGS__) namespace rspamd::langdet { + +/** + * Map callback data for fasttext model loading. + * Used by the maps infrastructure to atomically swap old/new models. 
+ */ +struct fasttext_map_data { + rspamd::fasttext::fasttext_model *model = nullptr; +}; + class fasttext_langdet { private: - std::optional model_; + /* Model pointer; for map-backed models this is managed via map callbacks */ + rspamd::fasttext::fasttext_model *model_ = nullptr; + /* Owned model for direct file loading (non-map case) */ + std::optional owned_model_; std::string model_fname; + struct rspamd_config *cfg_; + + void load_model_direct(const char *model_path) + { + auto *cfg = cfg_; + if (access(model_path, R_OK) != 0) { + msg_err_config("fasttext model '%s' is not readable: %s", + model_path, strerror(errno)); + return; + } + + auto result = rspamd::fasttext::fasttext_model::load(model_path); + if (result) { + owned_model_.emplace(std::move(*result)); + model_ = &owned_model_.value(); + model_fname = std::string{model_path}; + } + else { + msg_err_config("cannot load fasttext model '%s': %s", + model_path, result.error().error_message.data()); + } + } + + void load_model_map(const char *model_path) + { + auto *cfg = cfg_; + model_fname = std::string{model_path}; + + auto *map = rspamd_map_add(cfg_, model_path, + "fasttext language model", + fasttext_map_read_cb, + fasttext_map_fin_cb, + fasttext_map_dtor_cb, + reinterpret_cast(&model_), + nullptr, + RSPAMD_MAP_FILE_NO_READ); + + if (!map) { + msg_err_config("cannot add map for fasttext model '%s'", model_path); + } + } + + /* Map read callback: receives filename, loads model */ + static char *fasttext_map_read_cb(char *chunk, int len, + struct map_cb_data *data, gboolean final) + { + if (data->cur_data == nullptr) { + data->cur_data = new fasttext_map_data(); + } + + if (!final) { + return chunk + len; + } + + auto *fdata = static_cast(data->cur_data); + auto *map = data->map; + auto fname = std::string{chunk, static_cast(len)}; + auto offset = static_cast( + rspamd_map_get_no_file_read_offset(data->map)); + + auto result = rspamd::fasttext::fasttext_model::load(fname, offset); + if (result) { + 
fdata->model = new rspamd::fasttext::fasttext_model(std::move(*result)); + msg_info_map("loaded fasttext model from %s (offset %z)", + fname.c_str(), (gsize) offset); + } + else { + msg_err_map("cannot load fasttext model from %s (offset %z): %s", + fname.c_str(), (gsize) offset, + result.error().error_message.data()); + } + + return chunk + len; + } + + /* Map fin callback: swap old model for new one */ + static void fasttext_map_fin_cb(struct map_cb_data *data, void **target) + { + auto *new_data = static_cast(data->cur_data); + auto *old_data = static_cast(data->prev_data); + + if (data->errored) { + /* Clean up new data on error */ + if (new_data) { + delete new_data->model; + delete new_data; + data->cur_data = nullptr; + } + return; + } + + if (new_data && new_data->model) { + /* Publish new model pointer to consumer */ + if (target) { + *target = new_data->model; + } + } + + /* Destroy old model */ + if (old_data) { + delete old_data->model; + delete old_data; + } + } + + /* Map destructor callback */ + static void fasttext_map_dtor_cb(struct map_cb_data *data) + { + auto *fdata = static_cast(data->cur_data); + if (fdata) { + delete fdata->model; + delete fdata; + } + } public: explicit fasttext_langdet(struct rspamd_config *cfg) + : cfg_(cfg) { const auto *ucl_obj = cfg->cfg_ucl_obj; const auto *opts_section = ucl_object_find_key(ucl_obj, "lang_detection"); @@ -50,20 +179,11 @@ public: if (model) { const char *model_path = ucl_object_tostring(model); - if (access(model_path, R_OK) != 0) { - msg_err_config("fasttext model '%s' is not readable: %s", - model_path, strerror(errno)); - return; - } - - auto result = rspamd::fasttext::fasttext_model::load(model_path); - if (result) { - model_.emplace(std::move(*result)); - model_fname = std::string{model_path}; + if (rspamd_map_is_map(model_path)) { + load_model_map(model_path); } else { - msg_err_config("cannot load fasttext model '%s': %s", - model_path, result.error().error_message.data()); + 
load_model_direct(model_path); } } } @@ -78,7 +198,7 @@ public: auto is_enabled() const -> bool { - return model_.has_value(); + return model_ != nullptr; } auto word2vec(const char *in, std::size_t len, std::vector &word_ngramms) const diff --git a/src/libserver/fasttext/fasttext_shim.cxx b/src/libserver/fasttext/fasttext_shim.cxx index b6a7b6da48..8902ab8b0c 100644 --- a/src/libserver/fasttext/fasttext_shim.cxx +++ b/src/libserver/fasttext/fasttext_shim.cxx @@ -971,17 +971,18 @@ fasttext_model::fasttext_model(fasttext_model &&other) noexcept = default; fasttext_model &fasttext_model::operator=(fasttext_model &&other) noexcept = default; -auto fasttext_model::load(const char *path) -> tl::expected +auto fasttext_model::load(const char *path, std::int64_t offset) -> tl::expected { - /* mmap the entire file */ + /* mmap the file (possibly at an offset for map cache files) */ auto mmap_result = rspamd::util::raii_mmaped_file::mmap_shared( - path, O_RDONLY, PROT_READ); + path, O_RDONLY, PROT_READ, offset); if (!mmap_result) { return tl::make_unexpected(mmap_result.error()); } - auto file_size = mmap_result->get_size(); + /* Use the mapped region size, not full file size (they differ when offset > 0) */ + auto file_size = mmap_result->get_size() - (std::size_t) offset; auto *base = static_cast(mmap_result->get_map()); binary_reader reader(base, file_size); diff --git a/src/libserver/fasttext/fasttext_shim.h b/src/libserver/fasttext/fasttext_shim.h index 45425ba18a..80b661dbf2 100644 --- a/src/libserver/fasttext/fasttext_shim.h +++ b/src/libserver/fasttext/fasttext_shim.h @@ -93,10 +93,10 @@ public: * @param path path to the model file * @return loaded model or error */ - static auto load(const char *path) -> tl::expected; - static auto load(const std::string &path) -> tl::expected + static auto load(const char *path, std::int64_t offset = 0) -> tl::expected; + static auto load(const std::string &path, std::int64_t offset = 0) -> tl::expected { - return 
load(path.c_str()); + return load(path.c_str(), offset); } /**