git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Wire fasttext lang detector through maps infrastructure
author Vsevolod Stakhov <vsevolod@rspamd.com>
Wed, 25 Feb 2026 14:51:07 +0000 (14:51 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Wed, 25 Feb 2026 14:51:07 +0000 (14:51 +0000)
The fasttext language detector now supports HTTP/HTTPS URLs for model
loading via the maps system, enabling automatic download, disk caching,
periodic reload, and cross-worker mmap sharing.

Changes:
- fasttext_model::load() accepts an offset parameter for mmap at a
  non-zero position (used with page-aligned map cache files)
- fasttext_langdet uses rspamd_map_is_map() to detect URLs vs local
  paths; URLs go through rspamd_map_add() with RSPAMD_MAP_FILE_NO_READ
- Map callbacks (read/fin/dtor) handle atomic model swap on reload
- Local file paths continue to work as before with direct loading

src/libmime/lang_detection_fasttext.cxx
src/libserver/fasttext/fasttext_shim.cxx
src/libserver/fasttext/fasttext_shim.h

index c7dbe1ebbc1f674ba2d729cbf1abb5139d0f8761..0b67140e859433b76ea5ee4115e7cfd3048ce11b 100644 (file)
@@ -18,6 +18,8 @@
 #include "fasttext_shim.h"
 #include "libserver/cfg_file.h"
 #include "libserver/logger.h"
+#include "libserver/maps/map.h"
+#include "libserver/maps/map_private.h"
 #include "contrib/fmt/include/fmt/base.h"
 #include "stat_api.h"
 #include "libserver/word.h"
@@ -33,13 +35,140 @@ EXTERN_LOG_MODULE_DEF(langdet);
                                                                                                                          __VA_ARGS__)
 
 namespace rspamd::langdet {
+
+/**
+ * Holder placed in a map's callback data slots (cur_data / prev_data)
+ * while a fasttext model is loaded or reloaded through the maps code.
+ * The map callbacks in fasttext_langdet delete `model` together with
+ * the holder itself, so the pointer is owning.
+ */
+struct fasttext_map_data {
+       rspamd::fasttext::fasttext_model *model{nullptr};
+};
+
 class fasttext_langdet {
 private:
-       std::optional<rspamd::fasttext::fasttext_model> model_;
+       /* Model pointer; for map-backed models this is managed via map callbacks */
+       rspamd::fasttext::fasttext_model *model_ = nullptr;
+       /* Owned model for direct file loading (non-map case) */
+       std::optional<rspamd::fasttext::fasttext_model> owned_model_;
        std::string model_fname;
+       struct rspamd_config *cfg_;
+
+       /**
+        * Load a model straight from a local file, bypassing the maps code.
+        * On success the model is stored in owned_model_, model_ is pointed at
+        * it and model_fname records the path; on failure an error is logged
+        * and the detector state is left unchanged.
+        * @param model_path path of the model file on the local filesystem
+        */
+       void load_model_direct(const char *model_path)
+       {
+               /* Presumably consumed by the msg_err_config macro, which is defined elsewhere */
+               auto *cfg = cfg_;
+
+               if (access(model_path, R_OK) != 0) {
+                       msg_err_config("fasttext model '%s' is not readable: %s",
+                                                  model_path, strerror(errno));
+                       return;
+               }
+
+               auto loaded = rspamd::fasttext::fasttext_model::load(model_path);
+
+               if (!loaded) {
+                       msg_err_config("cannot load fasttext model '%s': %s",
+                                                  model_path, loaded.error().error_message.data());
+                       return;
+               }
+
+               /* emplace() returns a reference to the freshly stored model */
+               model_ = &owned_model_.emplace(std::move(*loaded));
+               model_fname = model_path;
+       }
+
+       /**
+        * Register the model location with the maps code instead of loading it
+        * directly. The address of model_ is handed over as the map target, so
+        * the fin callback can publish the loaded model through it.
+        * model_fname is recorded before the map is added; a failure to add
+        * the map is only logged.
+        * @param model_path map specification (e.g. an URL) of the model
+        */
+       void load_model_map(const char *model_path)
+       {
+               /* Presumably consumed by the msg_err_config macro, which is defined elsewhere */
+               auto *cfg = cfg_;
+               auto **model_slot = reinterpret_cast<void **>(&model_);
+
+               model_fname.assign(model_path);
+
+               auto *added_map = rspamd_map_add(cfg_, model_path,
+                                                                                "fasttext language model",
+                                                                                fasttext_map_read_cb,
+                                                                                fasttext_map_fin_cb,
+                                                                                fasttext_map_dtor_cb,
+                                                                                model_slot,
+                                                                                nullptr,
+                                                                                RSPAMD_MAP_FILE_NO_READ);
+
+               if (added_map == nullptr) {
+                       msg_err_config("cannot add map for fasttext model '%s'", model_path);
+               }
+       }
+
+       /*
+        * Map read callback. The chunk is interpreted as the name of the file
+        * to load (the map is registered with RSPAMD_MAP_FILE_NO_READ), and the
+        * model is mapped from it at the offset reported by the map.
+        * A holder is always allocated in data->cur_data; its `model` stays
+        * null when the chunk is not final or when loading fails.
+        * Always returns chunk + len.
+        */
+       static char *fasttext_map_read_cb(char *chunk, int len,
+                                                                         struct map_cb_data *data, gboolean final)
+       {
+               char *const consumed = chunk + len;
+
+               if (data->cur_data == nullptr) {
+                       data->cur_data = new fasttext_map_data();
+               }
+
+               if (!final) {
+                       return consumed;
+               }
+
+               auto *holder = static_cast<fasttext_map_data *>(data->cur_data);
+               /* Presumably consumed by the msg_*_map macros, which are defined elsewhere */
+               auto *map = data->map;
+               const std::string model_file(chunk, static_cast<std::size_t>(len));
+               const auto offset = static_cast<std::int64_t>(
+                       rspamd_map_get_no_file_read_offset(data->map));
+
+               auto loaded = rspamd::fasttext::fasttext_model::load(model_file, offset);
+
+               if (!loaded) {
+                       msg_err_map("cannot load fasttext model from %s (offset %z): %s",
+                                               model_file.c_str(), (gsize) offset,
+                                               loaded.error().error_message.data());
+                       return consumed;
+               }
+
+               holder->model = new rspamd::fasttext::fasttext_model(std::move(*loaded));
+               msg_info_map("loaded fasttext model from %s (offset %z)",
+                                        model_file.c_str(), (gsize) offset);
+
+               return consumed;
+       }
+
+       /*
+        * Map fin callback: publish a freshly loaded model through `target`
+        * and release the previous one.
+        *
+        * - data->errored: the new holder (and any model in it) is discarded,
+        *   the previous data and `*target` are left untouched.
+        * - new model present: `*target` is switched to it, then the previous
+        *   holder and model are deleted.
+        * - no new model (the read callback only logs a load failure and leaves
+        *   `model` null): the previous model is carried over into the current
+        *   holder rather than deleted, because `*target` still points at it;
+        *   deleting it would leave the consumer with a dangling pointer.
+        */
+       static void fasttext_map_fin_cb(struct map_cb_data *data, void **target)
+       {
+               auto *new_data = static_cast<fasttext_map_data *>(data->cur_data);
+               auto *old_data = static_cast<fasttext_map_data *>(data->prev_data);
+
+               if (data->errored) {
+                       /* Clean up new data on error */
+                       if (new_data) {
+                               delete new_data->model;
+                               delete new_data;
+                               data->cur_data = nullptr;
+                       }
+                       return;
+               }
+
+               if (new_data && new_data->model) {
+                       /* Publish new model pointer to consumer */
+                       if (target) {
+                               *target = new_data->model;
+                       }
+               }
+               else if (old_data && old_data->model) {
+                       /*
+                        * Nothing new to publish: keep the model the consumer is still
+                        * using alive by moving its ownership to the current holder.
+                        */
+                       if (!new_data) {
+                               new_data = new fasttext_map_data();
+                               data->cur_data = new_data;
+                       }
+                       new_data->model = old_data->model;
+                       old_data->model = nullptr;
+               }
+
+               /* Destroy old holder; its model is null if it was carried over above */
+               if (old_data) {
+                       delete old_data->model;
+                       delete old_data;
+               }
+       }
+
+       /* Map destructor callback: frees the current holder and the model it owns, if any */
+       static void fasttext_map_dtor_cb(struct map_cb_data *data)
+       {
+               auto *holder = static_cast<fasttext_map_data *>(data->cur_data);
+
+               if (holder == nullptr) {
+                       return;
+               }
+
+               delete holder->model;
+               delete holder;
+       }
 
 public:
        explicit fasttext_langdet(struct rspamd_config *cfg)
+               : cfg_(cfg)
        {
                const auto *ucl_obj = cfg->cfg_ucl_obj;
                const auto *opts_section = ucl_object_find_key(ucl_obj, "lang_detection");
@@ -50,20 +179,11 @@ public:
                        if (model) {
                                const char *model_path = ucl_object_tostring(model);
 
-                               if (access(model_path, R_OK) != 0) {
-                                       msg_err_config("fasttext model '%s' is not readable: %s",
-                                                                  model_path, strerror(errno));
-                                       return;
-                               }
-
-                               auto result = rspamd::fasttext::fasttext_model::load(model_path);
-                               if (result) {
-                                       model_.emplace(std::move(*result));
-                                       model_fname = std::string{model_path};
+                               if (rspamd_map_is_map(model_path)) {
+                                       load_model_map(model_path);
                                }
                                else {
-                                       msg_err_config("cannot load fasttext model '%s': %s",
-                                                                  model_path, result.error().error_message.data());
+                                       load_model_direct(model_path);
                                }
                        }
                }
@@ -78,7 +198,7 @@ public:
 
        auto is_enabled() const -> bool
        {
-               return model_.has_value();
+               return model_ != nullptr;
        }
 
        auto word2vec(const char *in, std::size_t len, std::vector<std::int32_t> &word_ngramms) const
index b6a7b6da484d7d24672861da1fd09156d8a3439d..8902ab8b0c7bd3472ba04d208cd816556da702a8 100644 (file)
@@ -971,17 +971,18 @@ fasttext_model::fasttext_model(fasttext_model &&other) noexcept = default;
 
 fasttext_model &fasttext_model::operator=(fasttext_model &&other) noexcept = default;
 
-auto fasttext_model::load(const char *path) -> tl::expected<fasttext_model, rspamd::util::error>
+auto fasttext_model::load(const char *path, std::int64_t offset) -> tl::expected<fasttext_model, rspamd::util::error>
 {
-       /* mmap the entire file */
+       /* mmap the file (possibly at an offset for map cache files) */
        auto mmap_result = rspamd::util::raii_mmaped_file::mmap_shared(
-               path, O_RDONLY, PROT_READ);
+               path, O_RDONLY, PROT_READ, offset);
 
        if (!mmap_result) {
                return tl::make_unexpected(mmap_result.error());
        }
 
-       auto file_size = mmap_result->get_size();
+       /* Use the mapped region size, not full file size (they differ when offset > 0) */
+       auto file_size = mmap_result->get_size() - (std::size_t) offset;
        auto *base = static_cast<const unsigned char *>(mmap_result->get_map());
 
        binary_reader reader(base, file_size);
index 45425ba18aa599ffc7dd9e3cd0bf8a8e85d4abeb..80b661dbf2e6956eb7c423274865bbce483dfe5a 100644 (file)
@@ -93,10 +93,10 @@ public:
         * @param path path to the model file
         * @return loaded model or error
         */
-       static auto load(const char *path) -> tl::expected<fasttext_model, rspamd::util::error>;
-       static auto load(const std::string &path) -> tl::expected<fasttext_model, rspamd::util::error>
+       static auto load(const char *path, std::int64_t offset = 0) -> tl::expected<fasttext_model, rspamd::util::error>;
+       static auto load(const std::string &path, std::int64_t offset = 0) -> tl::expected<fasttext_model, rspamd::util::error>
        {
-               return load(path.c_str());
+               return load(path.c_str(), offset);
        }
 
        /**