git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Project] Add ability to create custom tokenizers for languages
author     Vsevolod Stakhov <vsevolod@rspamd.com>
           Thu, 12 Jun 2025 09:08:45 +0000 (10:08 +0100)
committer  Vsevolod Stakhov <vsevolod@rspamd.com>
           Thu, 12 Jun 2025 09:08:45 +0000 (10:08 +0100)
src/libmime/message.c
src/libserver/cfg_file.h
src/libserver/cfg_utils.cxx
src/libstat/CMakeLists.txt
src/libstat/tokenizers/custom_tokenizer.h [new file with mode: 0644]
src/libstat/tokenizers/tokenizer_manager.c [new file with mode: 0644]
src/libstat/tokenizers/tokenizers.c
src/libstat/tokenizers/tokenizers.h
src/lua/lua_parsers.c

diff --git a/src/libmime/message.c b/src/libmime/message.c
index f2cabf399ffd4f6134e5f1cca1d4125b57f12f57..60894d879afc3f53e5cc3c584f1e4b8819fc0402 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -40,6 +40,7 @@
 #include "contrib/uthash/utlist.h"
 #include "contrib/t1ha/t1ha.h"
 #include "received.h"
+#include "libstat/tokenizers/custom_tokenizer.h"
 
 #define GTUBE_SYMBOL "GTUBE"
 
diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h
index 2d0797c98613910fc0f79e697159d7edb3a2f6c3..362ddc0aeff719ab57f2e6e59ee50ca34ccac5f6 100644 (file)
@@ -497,9 +497,10 @@ struct rspamd_config {
        char *zstd_output_dictionary; /**< path to zstd output dictionary                                               */
        ucl_object_t *neighbours;     /**< other servers in the cluster                                         */
 
-       struct rspamd_config_settings_elt *setting_ids; /**< preprocessed settings ids                                                  */
-       struct rspamd_lang_detector *lang_det;          /**< language detector                                                                  */
-       struct rspamd_worker *cur_worker;               /**< set dynamically by each worker                                                     */
+       struct rspamd_config_settings_elt *setting_ids;     /**< preprocessed settings ids                                                      */
+       struct rspamd_lang_detector *lang_det;              /**< language detector                                                                      */
+       struct rspamd_tokenizer_manager *tokenizer_manager; /**< custom tokenizer manager                                               */
+       struct rspamd_worker *cur_worker;                   /**< set dynamically by each worker                                                 */
 
        ref_entry_t ref; /**< reference counter                                                                 */
 };
diff --git a/src/libserver/cfg_utils.cxx b/src/libserver/cfg_utils.cxx
index b430a5fcab56052d7791385027056fa832c551b6..badcf6c5479a1198e478136f8b1e9da50aa351b6 100644 (file)
 #include "contrib/expected/expected.hpp"
 #include "contrib/ankerl/unordered_dense.h"
 
+#include "libserver/task.h"
+#include "libserver/url.h"
+#include "libstat/tokenizers/custom_tokenizer.h"
+
 #define DEFAULT_SCORE 10.0
 
 #define DEFAULT_RLIMIT_NOFILE 2048
@@ -940,6 +944,37 @@ rspamd_config_post_load(struct rspamd_config *cfg,
                        msg_err_config("cannot configure libraries, fatal error");
                        return FALSE;
                }
+
+               /* Load custom tokenizers */
+               const ucl_object_t *custom_tokenizers = ucl_object_lookup_path(cfg->cfg_ucl_obj,
+                                                                                                                                          "options.custom_tokenizers");
+               if (custom_tokenizers != NULL) {
+                       msg_info_config("loading custom tokenizers");
+                       cfg->tokenizer_manager = rspamd_tokenizer_manager_new(cfg->cfg_pool);
+
+                       ucl_object_iter_t it = ucl_object_iterate_new(custom_tokenizers);
+                       const ucl_object_t *tok_obj;
+                       const char *tok_name;
+
+                       while ((tok_obj = ucl_object_iterate_safe(it, true)) != NULL) {
+                               tok_name = ucl_object_key(tok_obj);
+                               GError *err = NULL;
+
+                               if (!rspamd_tokenizer_manager_load_tokenizer(cfg->tokenizer_manager,
+                                                                                                                        tok_name, tok_obj, &err)) {
+                                       msg_err_config("failed to load custom tokenizer '%s': %s",
+                                                                  tok_name, err ? err->message : "unknown error");
+                                       if (err) {
+                                               g_error_free(err);
+                                       }
+
+                                       if (opts & RSPAMD_CONFIG_INIT_VALIDATE) {
+                                               ret = tl::make_unexpected(fmt::format("failed to load custom tokenizer '{}'", tok_name));
+                                       }
+                               }
+                       }
+                       ucl_object_iterate_free(it);
+               }
        }
 
        /* Validate cache */
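For reference, the options.custom_tokenizers block parsed here is a table keyed by tokenizer name; each entry is handed to rspamd_tokenizer_manager_load_tokenizer(), which (see tokenizer_manager.c below) reads the enabled, path, priority and config keys. A minimal sketch of such a block, assuming it lives in the options section of the configuration (the tokenizer name and the .so path are hypothetical):

custom_tokenizers {
    japanese_mecab {
        enabled = true;
        path = "/usr/lib/rspamd/japanese_mecab.so";
        priority = 60;    # tokenizers with higher priority are tried first
        config {
            # arbitrary UCL, passed verbatim to the plugin's init() callback
        }
    }
}

Note that when RSPAMD_CONFIG_INIT_VALIDATE is set, a tokenizer that fails to load becomes a configuration error; otherwise the failure is only logged and startup continues.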
diff --git a/src/libstat/CMakeLists.txt b/src/libstat/CMakeLists.txt
index 64d572a57bbe93dead953105d5274a01c38ca301..eddf64e490c5888c7c291578e2c2f5ea2ea0f51d 100644 (file)
@@ -1,25 +1,26 @@
 # Librspamdserver
-SET(LIBSTATSRC         ${CMAKE_CURRENT_SOURCE_DIR}/stat_config.c
-                                       ${CMAKE_CURRENT_SOURCE_DIR}/stat_process.c)
+SET(LIBSTATSRC ${CMAKE_CURRENT_SOURCE_DIR}/stat_config.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/stat_process.c)
 
-SET(TOKENIZERSSRC      ${CMAKE_CURRENT_SOURCE_DIR}/tokenizers/tokenizers.c
-                                       ${CMAKE_CURRENT_SOURCE_DIR}/tokenizers/osb.c)
+SET(TOKENIZERSSRC ${CMAKE_CURRENT_SOURCE_DIR}/tokenizers/tokenizers.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/tokenizers/tokenizer_manager.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/tokenizers/osb.c)
 
-SET(CLASSIFIERSSRC     ${CMAKE_CURRENT_SOURCE_DIR}/classifiers/bayes.c
-                                       ${CMAKE_CURRENT_SOURCE_DIR}/classifiers/lua_classifier.c)
+SET(CLASSIFIERSSRC ${CMAKE_CURRENT_SOURCE_DIR}/classifiers/bayes.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/classifiers/lua_classifier.c)
 
-SET(BACKENDSSRC        ${CMAKE_CURRENT_SOURCE_DIR}/backends/mmaped_file.c
-                                       ${CMAKE_CURRENT_SOURCE_DIR}/backends/sqlite3_backend.c
-                                       ${CMAKE_CURRENT_SOURCE_DIR}/backends/cdb_backend.cxx
-                                       ${CMAKE_CURRENT_SOURCE_DIR}/backends/http_backend.cxx
-                                       ${CMAKE_CURRENT_SOURCE_DIR}/backends/redis_backend.cxx)
+SET(BACKENDSSRC ${CMAKE_CURRENT_SOURCE_DIR}/backends/mmaped_file.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/backends/sqlite3_backend.c
+        ${CMAKE_CURRENT_SOURCE_DIR}/backends/cdb_backend.cxx
+        ${CMAKE_CURRENT_SOURCE_DIR}/backends/http_backend.cxx
+        ${CMAKE_CURRENT_SOURCE_DIR}/backends/redis_backend.cxx)
 
-SET(CACHESSRC  ${CMAKE_CURRENT_SOURCE_DIR}/learn_cache/sqlite3_cache.c
+SET(CACHESSRC ${CMAKE_CURRENT_SOURCE_DIR}/learn_cache/sqlite3_cache.c
         ${CMAKE_CURRENT_SOURCE_DIR}/learn_cache/redis_cache.cxx)
 
 SET(RSPAMD_STAT ${LIBSTATSRC}
-                       ${TOKENIZERSSRC}
-                       ${CLASSIFIERSSRC}
-                       ${BACKENDSSRC}
-                       ${CACHESSRC} PARENT_SCOPE)
+        ${TOKENIZERSSRC}
+        ${CLASSIFIERSSRC}
+        ${BACKENDSSRC}
+        ${CACHESSRC} PARENT_SCOPE)
 
diff --git a/src/libstat/tokenizers/custom_tokenizer.h b/src/libstat/tokenizers/custom_tokenizer.h
new file mode 100644 (file)
index 0000000..bacb4e7
--- /dev/null
@@ -0,0 +1,172 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_CUSTOM_TOKENIZER_H
+#define RSPAMD_CUSTOM_TOKENIZER_H
+
+#include "config.h"
+#include "tokenizers.h"
+#include "ucl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RSPAMD_CUSTOM_TOKENIZER_API_VERSION 1
+
+/**
+ * Tokenization result - array of word positions as (start, length) pairs
+ * The array is terminated by a pair with both values set to 0
+ */
+struct rspamd_tokenizer_result {
+       unsigned int *positions; /* Array of (start, length) pairs */
+       size_t count;            /* Number of words (not array size!) */
+};
+
+/**
+ * Custom tokenizer API that must be implemented by language-specific tokenizer plugins
+ * All functions use only plain C types to ensure clean boundaries
+ */
+typedef struct rspamd_custom_tokenizer_api {
+       /* API version for compatibility checking */
+       unsigned int api_version;
+
+       /* Name of the tokenizer (e.g., "japanese_mecab") */
+       const char *name;
+
+       /**
+        * Global initialization function called once when the tokenizer is loaded
+        * @param config UCL configuration object for this tokenizer (may be NULL)
+        * @param error_buf Buffer for error message (at least 256 bytes)
+        * @return 0 on success, non-zero on failure
+        */
+       int (*init)(const ucl_object_t *config, char *error_buf, size_t error_buf_size);
+
+       /**
+        * Global cleanup function called when the tokenizer is unloaded
+        */
+       void (*deinit)(void);
+
+       /**
+        * Quick language detection to check if this tokenizer can handle the text
+        * @param text UTF-8 text to analyze
+        * @param len Length of the text in bytes
+        * @return Confidence score 0.0-1.0, or -1.0 if cannot handle
+        */
+       double (*detect_language)(const char *text, size_t len);
+
+       /**
+        * Main tokenization function
+        * @param text UTF-8 text to tokenize
+        * @param len Length of the text in bytes
+        * @param result Output structure to fill with word positions
+        * @return 0 on success, non-zero on failure
+        *
+        * The tokenizer should allocate result->positions using its own allocator
+        * Rspamd will call cleanup_result() to free it after processing
+        */
+       int (*tokenize)(const char *text, size_t len,
+                                       struct rspamd_tokenizer_result *result);
+
+       /**
+        * Cleanup the result from tokenize()
+        * @param result Result structure returned by tokenize()
+        *
+        * This function should free result->positions using the same allocator
+        * that was used in tokenize() and reset the structure fields.
+        * This ensures proper memory management across DLL boundaries.
+        * Note: This does NOT free the result structure itself, only its contents.
+        */
+       void (*cleanup_result)(struct rspamd_tokenizer_result *result);
+
+       /**
+        * Optional: Get language hint for better language detection
+        * @return Language code (e.g., "ja", "zh") or NULL
+        */
+       const char *(*get_language_hint)(void);
+
+       /**
+        * Optional: Get minimum confidence threshold for this tokenizer
+        * @return Minimum confidence (0.0-1.0) or -1.0 to use default
+        */
+       double (*get_min_confidence)(void);
+
+} rspamd_custom_tokenizer_api_t;
+
+/**
+ * Entry point function that plugins must export
+ * Must be named "rspamd_tokenizer_get_api"
+ */
+typedef const rspamd_custom_tokenizer_api_t *(*rspamd_tokenizer_get_api_func)(void);
+
+/* Internal Rspamd structures - not exposed to plugins */
+#ifdef RSPAMD_TOKENIZER_INTERNAL
+
+/**
+ * Custom tokenizer instance
+ */
+struct rspamd_custom_tokenizer {
+       char *name;                               /* Tokenizer name from config */
+       char *path;                               /* Path to .so file */
+       void *handle;                             /* dlopen handle */
+       const rspamd_custom_tokenizer_api_t *api; /* API functions */
+       double priority;                          /* Detection priority */
+       double min_confidence;                    /* Minimum confidence threshold */
+       gboolean enabled;                         /* Is tokenizer enabled */
+       ucl_object_t *config;                     /* Tokenizer-specific config */
+};
+
+/**
+ * Tokenizer manager structure
+ */
+struct rspamd_tokenizer_manager {
+       GHashTable *tokenizers;  /* name -> rspamd_custom_tokenizer */
+       GArray *detection_order; /* Ordered by priority */
+       rspamd_mempool_t *pool;
+       double default_threshold; /* Default confidence threshold */
+};
+
+/* Manager functions */
+struct rspamd_tokenizer_manager *rspamd_tokenizer_manager_new(rspamd_mempool_t *pool);
+void rspamd_tokenizer_manager_destroy(struct rspamd_tokenizer_manager *mgr);
+
+gboolean rspamd_tokenizer_manager_load_tokenizer(struct rspamd_tokenizer_manager *mgr,
+                                                                                                const char *name,
+                                                                                                const ucl_object_t *config,
+                                                                                                GError **err);
+
+struct rspamd_custom_tokenizer *rspamd_tokenizer_manager_detect(
+       struct rspamd_tokenizer_manager *mgr,
+       const char *text, size_t len,
+       double *confidence,
+       const char *lang_hint,
+       const char **detected_lang_hint);
+
+/* Helper function to tokenize with exceptions handling */
+GArray *rspamd_custom_tokenizer_tokenize_with_exceptions(
+       struct rspamd_custom_tokenizer *tokenizer,
+       const char *text,
+       gsize len,
+       GList *exceptions,
+       rspamd_mempool_t *pool);
+
+#endif /* RSPAMD_TOKENIZER_INTERNAL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RSPAMD_CUSTOM_TOKENIZER_H */
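To make the contract above concrete, here is a minimal sketch of a plugin that splits text on ASCII whitespace. The file, the example_whitespace name and the include path are hypothetical; a real plugin (for instance a MeCab-backed Japanese tokenizer, as the japanese_mecab example name suggests) would call into its segmentation library from tokenize(). Only what the header declares is relied upon.

#include <stdlib.h>
#include "custom_tokenizer.h" /* header introduced above; include path is an assumption */

static int
example_is_space(char c)
{
	return c == ' ' || c == '\t' || c == '\r' || c == '\n';
}

static int
example_init(const ucl_object_t *config, char *error_buf, size_t error_buf_size)
{
	(void) config; (void) error_buf; (void) error_buf_size;
	return 0; /* nothing to set up */
}

static void
example_deinit(void)
{
}

static double
example_detect_language(const char *text, size_t len)
{
	(void) text; (void) len;
	return 0.1; /* toy detector: always report low confidence */
}

static int
example_tokenize(const char *text, size_t len,
				 struct rspamd_tokenizer_result *result)
{
	size_t i = 0, nwords = 0, cap = 16;
	/* one extra pair is reserved for the terminating (0, 0) entry */
	unsigned int *pos = malloc((cap + 1) * 2 * sizeof(unsigned int));

	if (pos == NULL) {
		return -1;
	}

	while (i < len) {
		while (i < len && example_is_space(text[i])) {
			i++;
		}
		size_t start = i;
		while (i < len && !example_is_space(text[i])) {
			i++;
		}
		if (i > start) {
			if (nwords == cap) {
				cap *= 2;
				unsigned int *np = realloc(pos, (cap + 1) * 2 * sizeof(unsigned int));
				if (np == NULL) {
					free(pos);
					return -1;
				}
				pos = np;
			}
			pos[nwords * 2] = (unsigned int) start;           /* byte offset */
			pos[nwords * 2 + 1] = (unsigned int) (i - start); /* byte length */
			nwords++;
		}
	}

	pos[nwords * 2] = 0; /* terminating pair, as documented above */
	pos[nwords * 2 + 1] = 0;
	result->positions = pos;
	result->count = nwords;
	return 0;
}

static void
example_cleanup_result(struct rspamd_tokenizer_result *result)
{
	free(result->positions); /* same allocator as in tokenize() */
	result->positions = NULL;
	result->count = 0;
}

static const rspamd_custom_tokenizer_api_t example_api = {
	.api_version = RSPAMD_CUSTOM_TOKENIZER_API_VERSION,
	.name = "example_whitespace",
	.init = example_init,
	.deinit = example_deinit,
	.detect_language = example_detect_language,
	.tokenize = example_tokenize,
	.cleanup_result = example_cleanup_result,
	.get_language_hint = NULL,
	.get_min_confidence = NULL,
};

/* Mandatory entry point, resolved via dlsym() by the manager below */
const rspamd_custom_tokenizer_api_t *
rspamd_tokenizer_get_api(void)
{
	return &example_api;
}

Such a plugin would typically be compiled as a position-independent shared object (e.g. with -fPIC -shared) so that the manager introduced in the next file can dlopen() it and resolve rspamd_tokenizer_get_api() via dlsym().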
diff --git a/src/libstat/tokenizers/tokenizer_manager.c b/src/libstat/tokenizers/tokenizer_manager.c
new file mode 100644 (file)
index 0000000..cd18a5f
--- /dev/null
@@ -0,0 +1,462 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "config.h"
+#define RSPAMD_TOKENIZER_INTERNAL
+#include "custom_tokenizer.h"
+#include "libutil/util.h"
+#include "libserver/logger.h"
+#include <dlfcn.h>
+
+#define msg_err_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \
+                                                                                                                  "tokenizer", "",      \
+                                                                                                                  RSPAMD_LOG_FUNC,      \
+                                                                                                                  __VA_ARGS__)
+#define msg_warn_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \
+                                                                                                                       "tokenizer", "",     \
+                                                                                                                       RSPAMD_LOG_FUNC,     \
+                                                                                                                       __VA_ARGS__)
+#define msg_info_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \
+                                                                                                                       "tokenizer", "",  \
+                                                                                                                       RSPAMD_LOG_FUNC,  \
+                                                                                                                       __VA_ARGS__)
+#define msg_debug_tokenizer(...) rspamd_conditional_debug_fast(NULL, NULL,                               \
+                                                                                                                          rspamd_tokenizer_log_id, "tokenizer", "", \
+                                                                                                                          RSPAMD_LOG_FUNC,                          \
+                                                                                                                          __VA_ARGS__)
+
+INIT_LOG_MODULE(tokenizer)
+
+static void
+rspamd_custom_tokenizer_dtor(gpointer p)
+{
+       struct rspamd_custom_tokenizer *tok = p;
+
+       if (tok) {
+               if (tok->api && tok->api->deinit) {
+                       tok->api->deinit();
+               }
+
+               if (tok->handle) {
+                       dlclose(tok->handle);
+               }
+
+               if (tok->config) {
+                       ucl_object_unref(tok->config);
+               }
+
+               g_free(tok->name);
+               g_free(tok->path);
+               g_free(tok);
+       }
+}
+
+static int
+rspamd_custom_tokenizer_priority_cmp(gconstpointer a, gconstpointer b)
+{
+       const struct rspamd_custom_tokenizer *t1 = *(const struct rspamd_custom_tokenizer **) a;
+       const struct rspamd_custom_tokenizer *t2 = *(const struct rspamd_custom_tokenizer **) b;
+
+       /* Higher priority first */
+       if (t1->priority > t2->priority) {
+               return -1;
+       }
+       else if (t1->priority < t2->priority) {
+               return 1;
+       }
+
+       return 0;
+}
+
+struct rspamd_tokenizer_manager *
+rspamd_tokenizer_manager_new(rspamd_mempool_t *pool)
+{
+       struct rspamd_tokenizer_manager *mgr;
+
+       mgr = rspamd_mempool_alloc0(pool, sizeof(*mgr));
+       mgr->pool = pool;
+       mgr->tokenizers = g_hash_table_new_full(rspamd_strcase_hash,
+                                                                                       rspamd_strcase_equal,
+                                                                                       NULL,
+                                                                                       rspamd_custom_tokenizer_dtor);
+       mgr->detection_order = g_array_new(FALSE, FALSE, sizeof(struct rspamd_custom_tokenizer *));
+       mgr->default_threshold = 0.7; /* Default confidence threshold */
+
+       rspamd_mempool_add_destructor(pool,
+                                                                 (rspamd_mempool_destruct_t) g_hash_table_unref,
+                                                                 mgr->tokenizers);
+       rspamd_mempool_add_destructor(pool,
+                                                                 (rspamd_mempool_destruct_t) g_array_free,
+                                                                 mgr->detection_order);
+
+       return mgr;
+}
+
+void rspamd_tokenizer_manager_destroy(struct rspamd_tokenizer_manager *mgr)
+{
+       /* Cleanup is handled by memory pool destructors */
+}
+
+gboolean
+rspamd_tokenizer_manager_load_tokenizer(struct rspamd_tokenizer_manager *mgr,
+                                                                               const char *name,
+                                                                               const ucl_object_t *config,
+                                                                               GError **err)
+{
+       struct rspamd_custom_tokenizer *tok;
+       const ucl_object_t *elt;
+       rspamd_tokenizer_get_api_func get_api;
+       const rspamd_custom_tokenizer_api_t *api;
+       void *handle;
+       const char *path;
+       gboolean enabled = TRUE;
+       double priority = 50.0;
+       char error_buf[256];
+
+       g_assert(mgr != NULL);
+       g_assert(name != NULL);
+       g_assert(config != NULL);
+
+       /* Check if enabled */
+       elt = ucl_object_lookup(config, "enabled");
+       if (elt && ucl_object_type(elt) == UCL_BOOLEAN) {
+               enabled = ucl_object_toboolean(elt);
+       }
+
+       if (!enabled) {
+               msg_info_tokenizer("custom tokenizer %s is disabled", name);
+               return TRUE;
+       }
+
+       /* Get path */
+       elt = ucl_object_lookup(config, "path");
+       if (!elt || ucl_object_type(elt) != UCL_STRING) {
+               g_set_error(err, g_quark_from_static_string("tokenizer"),
+                                       EINVAL, "missing 'path' for tokenizer %s", name);
+               return FALSE;
+       }
+       path = ucl_object_tostring(elt);
+
+       /* Get priority */
+       elt = ucl_object_lookup(config, "priority");
+       if (elt) {
+               priority = ucl_object_todouble(elt);
+       }
+
+       /* Load the shared library */
+       handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
+       if (!handle) {
+               g_set_error(err, g_quark_from_static_string("tokenizer"),
+                                       EINVAL, "cannot load tokenizer %s from %s: %s",
+                                       name, path, dlerror());
+               return FALSE;
+       }
+
+       /* Get the API entry point */
+       get_api = (rspamd_tokenizer_get_api_func) dlsym(handle, "rspamd_tokenizer_get_api");
+       if (!get_api) {
+               dlclose(handle);
+               g_set_error(err, g_quark_from_static_string("tokenizer"),
+                                       EINVAL, "cannot find entry point in %s: %s",
+                                       path, dlerror());
+               return FALSE;
+       }
+
+       /* Get the API */
+       api = get_api();
+       if (!api) {
+               dlclose(handle);
+               g_set_error(err, g_quark_from_static_string("tokenizer"),
+                                       EINVAL, "tokenizer %s returned NULL API", name);
+               return FALSE;
+       }
+
+       /* Check API version */
+       if (api->api_version != RSPAMD_CUSTOM_TOKENIZER_API_VERSION) {
+               dlclose(handle);
+               g_set_error(err, g_quark_from_static_string("tokenizer"),
+                                       EINVAL, "tokenizer %s has incompatible API version %u (expected %u)",
+                                       name, api->api_version, RSPAMD_CUSTOM_TOKENIZER_API_VERSION);
+               return FALSE;
+       }
+
+       /* Create tokenizer instance */
+       tok = g_malloc0(sizeof(*tok));
+       tok->name = g_strdup(name);
+       tok->path = g_strdup(path);
+       tok->handle = handle;
+       tok->api = api;
+       tok->priority = priority;
+       tok->enabled = enabled;
+
+       /* Get tokenizer config */
+       elt = ucl_object_lookup(config, "config");
+       if (elt) {
+               tok->config = ucl_object_ref(elt);
+       }
+
+       /* Get minimum confidence */
+       if (api->get_min_confidence) {
+               tok->min_confidence = api->get_min_confidence();
+       }
+       else {
+               tok->min_confidence = mgr->default_threshold;
+       }
+
+       /* Initialize the tokenizer */
+       if (api->init) {
+               error_buf[0] = '\0';
+               if (api->init(tok->config, error_buf, sizeof(error_buf)) != 0) {
+                       g_set_error(err, g_quark_from_static_string("tokenizer"),
+                                               EINVAL, "failed to initialize tokenizer %s: %s",
+                                               name, error_buf[0] ? error_buf : "unknown error");
+                       rspamd_custom_tokenizer_dtor(tok);
+                       return FALSE;
+               }
+       }
+
+       /* Add to manager */
+       g_hash_table_insert(mgr->tokenizers, tok->name, tok);
+       g_array_append_val(mgr->detection_order, tok);
+
+       /* Re-sort by priority */
+       g_array_sort(mgr->detection_order, rspamd_custom_tokenizer_priority_cmp);
+
+       msg_info_tokenizer("loaded custom tokenizer %s (priority %.0f) from %s",
+                                          name, priority, path);
+
+       return TRUE;
+}
+
+struct rspamd_custom_tokenizer *
+rspamd_tokenizer_manager_detect(struct rspamd_tokenizer_manager *mgr,
+                                                               const char *text, size_t len,
+                                                               double *confidence,
+                                                               const char *lang_hint,
+                                                               const char **detected_lang_hint)
+{
+       struct rspamd_custom_tokenizer *tok, *best_tok = NULL;
+       double conf, best_conf = 0.0;
+       unsigned int i;
+
+       g_assert(mgr != NULL);
+       g_assert(text != NULL);
+
+       if (confidence) {
+               *confidence = 0.0;
+       }
+
+       if (detected_lang_hint) {
+               *detected_lang_hint = NULL;
+       }
+
+       /* If we have a language hint, try to find a tokenizer for that language first */
+       if (lang_hint) {
+               for (i = 0; i < mgr->detection_order->len; i++) {
+                       tok = g_array_index(mgr->detection_order, struct rspamd_custom_tokenizer *, i);
+
+                       if (!tok->enabled || !tok->api->get_language_hint) {
+                               continue;
+                       }
+
+                       /* Check if this tokenizer handles the hinted language */
+                       const char *tok_lang = tok->api->get_language_hint();
+                       if (tok_lang && g_ascii_strcasecmp(tok_lang, lang_hint) == 0) {
+                               /* Found a tokenizer for this language, check if it actually detects it */
+                               if (tok->api->detect_language) {
+                                       conf = tok->api->detect_language(text, len);
+                                       if (conf >= tok->min_confidence) {
+                                               /* Use this tokenizer */
+                                               if (confidence) {
+                                                       *confidence = conf;
+                                               }
+                                               if (detected_lang_hint) {
+                                                       *detected_lang_hint = tok_lang;
+                                               }
+                                               return tok;
+                                       }
+                               }
+                       }
+               }
+       }
+
+       /* Try each tokenizer in priority order */
+       for (i = 0; i < mgr->detection_order->len; i++) {
+               tok = g_array_index(mgr->detection_order, struct rspamd_custom_tokenizer *, i);
+
+               if (!tok->enabled || !tok->api->detect_language) {
+                       continue;
+               }
+
+               conf = tok->api->detect_language(text, len);
+
+               if (conf > best_conf && conf >= tok->min_confidence) {
+                       best_conf = conf;
+                       best_tok = tok;
+
+                       /* Early exit if very confident */
+                       if (conf >= 0.95) {
+                               break;
+                       }
+               }
+       }
+
+       if (confidence && best_tok) {
+               *confidence = best_conf;
+       }
+
+       if (detected_lang_hint && best_tok && best_tok->api->get_language_hint) {
+               *detected_lang_hint = best_tok->api->get_language_hint();
+       }
+
+       return best_tok;
+}
+
+/* Helper function to tokenize with a custom tokenizer handling exceptions */
+GArray *
+rspamd_custom_tokenizer_tokenize_with_exceptions(
+       struct rspamd_custom_tokenizer *tokenizer,
+       const char *text,
+       gsize len,
+       GList *exceptions,
+       rspamd_mempool_t *pool)
+{
+       GArray *words;
+       struct rspamd_tokenizer_result result;
+       struct rspamd_process_exception *ex;
+       GList *cur_ex = exceptions;
+       gsize pos = 0;
+       unsigned int i;
+       int ret;
+
+       words = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_stat_token_t), 128);
+
+       /* If no exceptions, tokenize the whole text */
+       if (!exceptions) {
+               result.positions = NULL;
+               result.count = 0;
+
+               ret = tokenizer->api->tokenize(text, len, &result);
+               if (ret == 0 && result.positions) {
+                       /* Convert positions to tokens */
+                       for (i = 0; i < result.count; i++) {
+                               rspamd_stat_token_t tok;
+                               unsigned int start = result.positions[i * 2];
+                               unsigned int length = result.positions[i * 2 + 1];
+
+                               if (start + length <= len) {
+                                       memset(&tok, 0, sizeof(tok));
+                                       tok.original.begin = text + start;
+                                       tok.original.len = length;
+                                       tok.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF;
+                                       g_array_append_val(words, tok);
+                               }
+                       }
+
+                       /* Use tokenizer's cleanup function */
+                       if (tokenizer->api->cleanup_result) {
+                               tokenizer->api->cleanup_result(&result);
+                       }
+               }
+
+               return words;
+       }
+
+       /* Process text with exceptions */
+       while (pos < len && cur_ex) {
+               ex = (struct rspamd_process_exception *) cur_ex->data;
+
+               /* Tokenize text before exception */
+               if (ex->pos > pos) {
+                       gsize segment_len = ex->pos - pos;
+                       result.positions = NULL;
+                       result.count = 0;
+
+                       ret = tokenizer->api->tokenize(text + pos, segment_len, &result);
+                       if (ret == 0 && result.positions) {
+                               /* Convert positions to tokens, adjusting for segment offset */
+                               for (i = 0; i < result.count; i++) {
+                                       rspamd_stat_token_t tok;
+                                       unsigned int start = result.positions[i * 2] + pos;
+                                       unsigned int length = result.positions[i * 2 + 1];
+
+                                       if (start + length <= ex->pos) {
+                                               memset(&tok, 0, sizeof(tok));
+                                               tok.original.begin = text + start;
+                                               tok.original.len = length;
+                                               tok.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF;
+                                               g_array_append_val(words, tok);
+                                       }
+                               }
+
+                               /* Use tokenizer's cleanup function */
+                               if (tokenizer->api->cleanup_result) {
+                                       tokenizer->api->cleanup_result(&result);
+                               }
+                       }
+               }
+
+               /* Add exception as a special token */
+               rspamd_stat_token_t ex_tok;
+               memset(&ex_tok, 0, sizeof(ex_tok));
+
+               if (ex->type == RSPAMD_EXCEPTION_URL) {
+                       ex_tok.original.begin = "!!EX!!";
+                       ex_tok.original.len = 6;
+               }
+               else {
+                       ex_tok.original.begin = text + ex->pos;
+                       ex_tok.original.len = ex->len;
+               }
+               ex_tok.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+               g_array_append_val(words, ex_tok);
+
+               /* Move past exception */
+               pos = ex->pos + ex->len;
+               cur_ex = g_list_next(cur_ex);
+       }
+
+       /* Process remaining text after last exception */
+       if (pos < len) {
+               result.positions = NULL;
+               result.count = 0;
+
+               ret = tokenizer->api->tokenize(text + pos, len - pos, &result);
+               if (ret == 0 && result.positions) {
+                       /* Convert positions to tokens, adjusting for segment offset */
+                       for (i = 0; i < result.count; i++) {
+                               rspamd_stat_token_t tok;
+                               unsigned int start = result.positions[i * 2] + pos;
+                               unsigned int length = result.positions[i * 2 + 1];
+
+                               if (start + length <= len) {
+                                       memset(&tok, 0, sizeof(tok));
+                                       tok.original.begin = text + start;
+                                       tok.original.len = length;
+                                       tok.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF;
+                                       g_array_append_val(words, tok);
+                               }
+                       }
+
+                       /* Use tokenizer's cleanup function */
+                       if (tokenizer->api->cleanup_result) {
+                               tokenizer->api->cleanup_result(&result);
+                       }
+               }
+       }
+
+       return words;
+}
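To illustrate how rspamd_custom_tokenizer_tokenize_with_exceptions() splits text around exceptions, here is a hedged sketch of a caller; tok and pool are assumed to be a loaded tokenizer (say, the whitespace sketch above) and a memory pool, and the byte offsets refer to the literal text below.

/* "http://x.co" occupies bytes 4..14 of the 19-byte text */
const char *text = "see http://x.co now";
struct rspamd_process_exception ex;

memset(&ex, 0, sizeof(ex));
ex.pos = 4;
ex.len = 11;
ex.type = RSPAMD_EXCEPTION_URL;

GList *exceptions = g_list_append(NULL, &ex);
GArray *words = rspamd_custom_tokenizer_tokenize_with_exceptions(
	tok, text, strlen(text), exceptions, pool);
/* With a whitespace tokenizer, words now holds three rspamd_stat_token_t
 * entries: "see", the "!!EX!!" exception placeholder, and "now".
 * The URL bytes are never handed to the plugin. */
g_list_free(exceptions);

Non-URL exceptions keep their original bytes as the placeholder token instead of "!!EX!!", as the helper above shows.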
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 0ea1bcfc6f1898f4105e48e1b52c7a2852ce8de2..4667976fbe308bd5f42da5fed13ca0483b637cfd 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,8 @@
 #include "contrib/mumhash/mum.h"
 #include "libmime/lang_detection.h"
 #include "libstemmer.h"
+#define RSPAMD_TOKENIZER_INTERNAL
+#include "custom_tokenizer.h"
 
 #include <unicode/utf8.h>
 #include <unicode/uchar.h>
@@ -300,6 +302,9 @@ rspamd_tokenize_text(const char *text, gsize len,
        static const gsize long_text_limit = 1 * 1024 * 1024;
        static const ev_tstamp max_exec_time = 0.2; /* 200 ms */
        ev_tstamp start;
+       struct rspamd_custom_tokenizer *custom_tok = NULL;
+       double custom_confidence = 0.0;
+       const char *detected_lang = NULL;
 
        if (text == NULL) {
                return cur_words;
@@ -334,6 +339,54 @@ rspamd_tokenize_text(const char *text, gsize len,
                res = cur_words;
        }
 
+       /* Try custom tokenizers first if we're in UTF mode */
+       if (cfg && cfg->tokenizer_manager && how == RSPAMD_TOKENIZE_UTF && utxt != NULL) {
+               custom_tok = rspamd_tokenizer_manager_detect(
+                       cfg->tokenizer_manager,
+                       text, len,
+                       &custom_confidence,
+                       NULL, /* no input language hint */
+                       &detected_lang);
+
+               if (custom_tok && custom_confidence >= custom_tok->min_confidence) {
+                       /* Use custom tokenizer with exception handling */
+                       GArray *custom_res = rspamd_custom_tokenizer_tokenize_with_exceptions(
+                               custom_tok, text, len, exceptions, pool);
+
+                       if (custom_res) {
+                               msg_debug_pool("using custom tokenizer %s (confidence: %.2f) for text tokenization",
+                                                          custom_tok->name, custom_confidence);
+
+                               /* Calculate hash if needed */
+                               if (hash && custom_res->len > 0) {
+                                       unsigned int i;
+                                       for (i = 0; i < custom_res->len; i++) {
+                                               rspamd_stat_token_t *t = &g_array_index(custom_res, rspamd_stat_token_t, i);
+                                               if (t->original.len >= sizeof(uint64_t)) {
+                                                       uint64_t tmp;
+                                                       memcpy(&tmp, t->original.begin, sizeof(tmp));
+                                                       hv = mum_hash_step(hv, tmp);
+                                               }
+                                       }
+                                       *hash = mum_hash_finish(hv);
+                               }
+
+                               /* If we had existing words, append to them */
+                               if (cur_words && custom_res != cur_words) {
+                                       g_array_append_vals(cur_words, custom_res->data, custom_res->len);
+                                       g_array_free(custom_res, TRUE);
+                                       return cur_words;
+                               }
+
+                               return custom_res;
+                       }
+                       else {
+                               msg_warn_pool("custom tokenizer %s failed to tokenize text, falling back to default",
+                                                         custom_tok->name);
+                       }
+               }
+       }
+
        if (G_UNLIKELY(how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) {
                while (rspamd_tokenizer_get_word_raw(&buf, &pos, &token, &cur, &l, FALSE)) {
                        if (l == 0 || (min_len > 0 && l < min_len) ||
@@ -952,4 +1005,4 @@ void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
                        }
                }
        }
-}
\ No newline at end of file
+}
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index d4a8824a8614970510bc2830e3dc5519bb977294..f3066b5cf1a98e91f78b542d45cccf323085402c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/src/lua/lua_parsers.c b/src/lua/lua_parsers.c
index f77b369523a82afb66459a550a0198cb7b39c357..f1208abd2a4c786365bc19997f8d52d13d512de3 100644 (file)
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2020 Vsevolod Stakhov
+/*
+ * Copyright 2025 Vsevolod Stakhov
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- *   http://www.apache.org/licenses/LICENSE-2.0
+ *    http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,