--- /dev/null
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_CUSTOM_TOKENIZER_H
+#define RSPAMD_CUSTOM_TOKENIZER_H
+
+#include "config.h"
+#include "tokenizers.h"
+#include "ucl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RSPAMD_CUSTOM_TOKENIZER_API_VERSION 1
+
+/**
+ * Tokenization result - array of word positions as (start, length) pairs
+ * The array is terminated by a pair with both values set to 0
+ */
+struct rspamd_tokenizer_result {
+ unsigned int *positions; /* Array of (start, length) pairs */
+ size_t count; /* Number of words (not array size!) */
+};
+
+/**
+ * Custom tokenizer API that must be implemented by language-specific tokenizer plugins
+ * All functions use only plain C types to ensure clean boundaries
+ */
+typedef struct rspamd_custom_tokenizer_api {
+ /* API version for compatibility checking */
+ unsigned int api_version;
+
+ /* Name of the tokenizer (e.g., "japanese_mecab") */
+ const char *name;
+
+ /**
+ * Global initialization function called once when the tokenizer is loaded
+ * @param config UCL configuration object for this tokenizer (may be NULL)
+ * @param error_buf Buffer for error message (at least 256 bytes)
+ * @return 0 on success, non-zero on failure
+ */
+ int (*init)(const ucl_object_t *config, char *error_buf, size_t error_buf_size);
+
+ /**
+ * Global cleanup function called when the tokenizer is unloaded
+ */
+ void (*deinit)(void);
+
+ /**
+ * Quick language detection to check if this tokenizer can handle the text
+ * @param text UTF-8 text to analyze
+ * @param len Length of the text in bytes
+ * @return Confidence score 0.0-1.0, or -1.0 if cannot handle
+ */
+ double (*detect_language)(const char *text, size_t len);
+
+ /**
+ * Main tokenization function
+ * @param text UTF-8 text to tokenize
+ * @param len Length of the text in bytes
+ * @param result Output structure to fill with word positions
+ * @return 0 on success, non-zero on failure
+ *
+ * The tokenizer should allocate result->positions using its own allocator
+ * Rspamd will call cleanup_result() to free it after processing
+ */
+ int (*tokenize)(const char *text, size_t len,
+ struct rspamd_tokenizer_result *result);
+
+ /**
+ * Cleanup the result from tokenize()
+ * @param result Result structure returned by tokenize()
+ *
+ * This function should free result->positions using the same allocator
+ * that was used in tokenize() and reset the structure fields.
+ * This ensures proper memory management across DLL boundaries.
+ * Note: This does NOT free the result structure itself, only its contents.
+ */
+ void (*cleanup_result)(struct rspamd_tokenizer_result *result);
+
+ /**
+ * Optional: Get language hint for better language detection
+ * @return Language code (e.g., "ja", "zh") or NULL
+ */
+ const char *(*get_language_hint)(void);
+
+ /**
+ * Optional: Get minimum confidence threshold for this tokenizer
+ * @return Minimum confidence (0.0-1.0) or -1.0 to use default
+ */
+ double (*get_min_confidence)(void);
+
+} rspamd_custom_tokenizer_api_t;
+
+/**
+ * Entry point function that plugins must export
+ * Must be named "rspamd_tokenizer_get_api"
+ */
+typedef const rspamd_custom_tokenizer_api_t *(*rspamd_tokenizer_get_api_func)(void);
+
+/* Internal Rspamd structures - not exposed to plugins */
+#ifdef RSPAMD_TOKENIZER_INTERNAL
+
+/**
+ * Custom tokenizer instance
+ */
+struct rspamd_custom_tokenizer {
+ char *name; /* Tokenizer name from config */
+ char *path; /* Path to .so file */
+ void *handle; /* dlopen handle */
+ const rspamd_custom_tokenizer_api_t *api; /* API functions */
+ double priority; /* Detection priority */
+ double min_confidence; /* Minimum confidence threshold */
+ gboolean enabled; /* Is tokenizer enabled */
+ ucl_object_t *config; /* Tokenizer-specific config */
+};
+
+/**
+ * Tokenizer manager structure
+ */
+struct rspamd_tokenizer_manager {
+ GHashTable *tokenizers; /* name -> rspamd_custom_tokenizer */
+ GArray *detection_order; /* Ordered by priority */
+ rspamd_mempool_t *pool;
+ double default_threshold; /* Default confidence threshold */
+};
+
+/* Manager functions */
+struct rspamd_tokenizer_manager *rspamd_tokenizer_manager_new(rspamd_mempool_t *pool);
+void rspamd_tokenizer_manager_destroy(struct rspamd_tokenizer_manager *mgr);
+
+gboolean rspamd_tokenizer_manager_load_tokenizer(struct rspamd_tokenizer_manager *mgr,
+ const char *name,
+ const ucl_object_t *config,
+ GError **err);
+
+struct rspamd_custom_tokenizer *rspamd_tokenizer_manager_detect(
+ struct rspamd_tokenizer_manager *mgr,
+ const char *text, size_t len,
+ double *confidence,
+ const char *lang_hint,
+ const char **detected_lang_hint);
+
+/* Helper function to tokenize with exceptions handling */
+GArray *rspamd_custom_tokenizer_tokenize_with_exceptions(
+ struct rspamd_custom_tokenizer *tokenizer,
+ const char *text,
+ gsize len,
+ GList *exceptions,
+ rspamd_mempool_t *pool);
+
+#endif /* RSPAMD_TOKENIZER_INTERNAL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RSPAMD_CUSTOM_TOKENIZER_H */
--- /dev/null
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "config.h"
+#define RSPAMD_TOKENIZER_INTERNAL
+#include "custom_tokenizer.h"
+#include "libutil/util.h"
+#include "libserver/logger.h"
+#include <dlfcn.h>
+
+#define msg_err_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \
+ "tokenizer", "", \
+ RSPAMD_LOG_FUNC, \
+ __VA_ARGS__)
+#define msg_warn_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \
+ "tokenizer", "", \
+ RSPAMD_LOG_FUNC, \
+ __VA_ARGS__)
+#define msg_info_tokenizer(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \
+ "tokenizer", "", \
+ RSPAMD_LOG_FUNC, \
+ __VA_ARGS__)
+#define msg_debug_tokenizer(...) rspamd_conditional_debug_fast(NULL, NULL, \
+ rspamd_tokenizer_log_id, "tokenizer", "", \
+ RSPAMD_LOG_FUNC, \
+ __VA_ARGS__)
+
+INIT_LOG_MODULE(tokenizer)
+
+static void
+rspamd_custom_tokenizer_dtor(gpointer p)
+{
+ struct rspamd_custom_tokenizer *tok = p;
+
+ if (tok) {
+ if (tok->api && tok->api->deinit) {
+ tok->api->deinit();
+ }
+
+ if (tok->handle) {
+ dlclose(tok->handle);
+ }
+
+ if (tok->config) {
+ ucl_object_unref(tok->config);
+ }
+
+ g_free(tok->name);
+ g_free(tok->path);
+ g_free(tok);
+ }
+}
+
+static int
+rspamd_custom_tokenizer_priority_cmp(gconstpointer a, gconstpointer b)
+{
+ const struct rspamd_custom_tokenizer *t1 = *(const struct rspamd_custom_tokenizer **) a;
+ const struct rspamd_custom_tokenizer *t2 = *(const struct rspamd_custom_tokenizer **) b;
+
+ /* Higher priority first */
+ if (t1->priority > t2->priority) {
+ return -1;
+ }
+ else if (t1->priority < t2->priority) {
+ return 1;
+ }
+
+ return 0;
+}
+
+struct rspamd_tokenizer_manager *
+rspamd_tokenizer_manager_new(rspamd_mempool_t *pool)
+{
+ struct rspamd_tokenizer_manager *mgr;
+
+ mgr = rspamd_mempool_alloc0(pool, sizeof(*mgr));
+ mgr->pool = pool;
+ mgr->tokenizers = g_hash_table_new_full(rspamd_strcase_hash,
+ rspamd_strcase_equal,
+ NULL,
+ rspamd_custom_tokenizer_dtor);
+ mgr->detection_order = g_array_new(FALSE, FALSE, sizeof(struct rspamd_custom_tokenizer *));
+ mgr->default_threshold = 0.7; /* Default confidence threshold */
+
+ rspamd_mempool_add_destructor(pool,
+ (rspamd_mempool_destruct_t) g_hash_table_unref,
+ mgr->tokenizers);
+ rspamd_mempool_add_destructor(pool,
+ (rspamd_mempool_destruct_t) g_array_free,
+ mgr->detection_order);
+
+ return mgr;
+}
+
+void rspamd_tokenizer_manager_destroy(struct rspamd_tokenizer_manager *mgr)
+{
+ /* Cleanup is handled by memory pool destructors */
+}
+
+gboolean
+rspamd_tokenizer_manager_load_tokenizer(struct rspamd_tokenizer_manager *mgr,
+ const char *name,
+ const ucl_object_t *config,
+ GError **err)
+{
+ struct rspamd_custom_tokenizer *tok;
+ const ucl_object_t *elt;
+ rspamd_tokenizer_get_api_func get_api;
+ const rspamd_custom_tokenizer_api_t *api;
+ void *handle;
+ const char *path;
+ gboolean enabled = TRUE;
+ double priority = 50.0;
+ char error_buf[256];
+
+ g_assert(mgr != NULL);
+ g_assert(name != NULL);
+ g_assert(config != NULL);
+
+ /* Check if enabled */
+ elt = ucl_object_lookup(config, "enabled");
+ if (elt && ucl_object_type(elt) == UCL_BOOLEAN) {
+ enabled = ucl_object_toboolean(elt);
+ }
+
+ if (!enabled) {
+ msg_info_tokenizer("custom tokenizer %s is disabled", name);
+ return TRUE;
+ }
+
+ /* Get path */
+ elt = ucl_object_lookup(config, "path");
+ if (!elt || ucl_object_type(elt) != UCL_STRING) {
+ g_set_error(err, g_quark_from_static_string("tokenizer"),
+ EINVAL, "missing 'path' for tokenizer %s", name);
+ return FALSE;
+ }
+ path = ucl_object_tostring(elt);
+
+ /* Get priority */
+ elt = ucl_object_lookup(config, "priority");
+ if (elt) {
+ priority = ucl_object_todouble(elt);
+ }
+
+ /* Load the shared library */
+ handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
+ if (!handle) {
+ g_set_error(err, g_quark_from_static_string("tokenizer"),
+ EINVAL, "cannot load tokenizer %s from %s: %s",
+ name, path, dlerror());
+ return FALSE;
+ }
+
+ /* Get the API entry point */
+ get_api = (rspamd_tokenizer_get_api_func) dlsym(handle, "rspamd_tokenizer_get_api");
+ if (!get_api) {
+ dlclose(handle);
+ g_set_error(err, g_quark_from_static_string("tokenizer"),
+ EINVAL, "cannot find entry point in %s: %s",
+ path, dlerror());
+ return FALSE;
+ }
+
+ /* Get the API */
+ api = get_api();
+ if (!api) {
+ dlclose(handle);
+ g_set_error(err, g_quark_from_static_string("tokenizer"),
+ EINVAL, "tokenizer %s returned NULL API", name);
+ return FALSE;
+ }
+
+ /* Check API version */
+ if (api->api_version != RSPAMD_CUSTOM_TOKENIZER_API_VERSION) {
+ dlclose(handle);
+ g_set_error(err, g_quark_from_static_string("tokenizer"),
+ EINVAL, "tokenizer %s has incompatible API version %u (expected %u)",
+ name, api->api_version, RSPAMD_CUSTOM_TOKENIZER_API_VERSION);
+ return FALSE;
+ }
+
+ /* Create tokenizer instance */
+ tok = g_malloc0(sizeof(*tok));
+ tok->name = g_strdup(name);
+ tok->path = g_strdup(path);
+ tok->handle = handle;
+ tok->api = api;
+ tok->priority = priority;
+ tok->enabled = enabled;
+
+ /* Get tokenizer config */
+ elt = ucl_object_lookup(config, "config");
+ if (elt) {
+ tok->config = ucl_object_ref(elt);
+ }
+
+ /* Get minimum confidence */
+ if (api->get_min_confidence) {
+ tok->min_confidence = api->get_min_confidence();
+ }
+ else {
+ tok->min_confidence = mgr->default_threshold;
+ }
+
+ /* Initialize the tokenizer */
+ if (api->init) {
+ error_buf[0] = '\0';
+ if (api->init(tok->config, error_buf, sizeof(error_buf)) != 0) {
+ g_set_error(err, g_quark_from_static_string("tokenizer"),
+ EINVAL, "failed to initialize tokenizer %s: %s",
+ name, error_buf[0] ? error_buf : "unknown error");
+ rspamd_custom_tokenizer_dtor(tok);
+ return FALSE;
+ }
+ }
+
+ /* Add to manager */
+ g_hash_table_insert(mgr->tokenizers, tok->name, tok);
+ g_array_append_val(mgr->detection_order, tok);
+
+ /* Re-sort by priority */
+ g_array_sort(mgr->detection_order, rspamd_custom_tokenizer_priority_cmp);
+
+ msg_info_tokenizer("loaded custom tokenizer %s (priority %.0f) from %s",
+ name, priority, path);
+
+ return TRUE;
+}
+
+struct rspamd_custom_tokenizer *
+rspamd_tokenizer_manager_detect(struct rspamd_tokenizer_manager *mgr,
+ const char *text, size_t len,
+ double *confidence,
+ const char *lang_hint,
+ const char **detected_lang_hint)
+{
+ struct rspamd_custom_tokenizer *tok, *best_tok = NULL;
+ double conf, best_conf = 0.0;
+ unsigned int i;
+
+ g_assert(mgr != NULL);
+ g_assert(text != NULL);
+
+ if (confidence) {
+ *confidence = 0.0;
+ }
+
+ if (detected_lang_hint) {
+ *detected_lang_hint = NULL;
+ }
+
+ /* If we have a language hint, try to find a tokenizer for that language first */
+ if (lang_hint) {
+ for (i = 0; i < mgr->detection_order->len; i++) {
+ tok = g_array_index(mgr->detection_order, struct rspamd_custom_tokenizer *, i);
+
+ if (!tok->enabled || !tok->api->get_language_hint) {
+ continue;
+ }
+
+ /* Check if this tokenizer handles the hinted language */
+ const char *tok_lang = tok->api->get_language_hint();
+ if (tok_lang && g_ascii_strcasecmp(tok_lang, lang_hint) == 0) {
+ /* Found a tokenizer for this language, check if it actually detects it */
+ if (tok->api->detect_language) {
+ conf = tok->api->detect_language(text, len);
+ if (conf >= tok->min_confidence) {
+ /* Use this tokenizer */
+ if (confidence) {
+ *confidence = conf;
+ }
+ if (detected_lang_hint) {
+ *detected_lang_hint = tok_lang;
+ }
+ return tok;
+ }
+ }
+ }
+ }
+ }
+
+ /* Try each tokenizer in priority order */
+ for (i = 0; i < mgr->detection_order->len; i++) {
+ tok = g_array_index(mgr->detection_order, struct rspamd_custom_tokenizer *, i);
+
+ if (!tok->enabled || !tok->api->detect_language) {
+ continue;
+ }
+
+ conf = tok->api->detect_language(text, len);
+
+ if (conf > best_conf && conf >= tok->min_confidence) {
+ best_conf = conf;
+ best_tok = tok;
+
+ /* Early exit if very confident */
+ if (conf >= 0.95) {
+ break;
+ }
+ }
+ }
+
+ if (confidence && best_tok) {
+ *confidence = best_conf;
+ }
+
+ if (detected_lang_hint && best_tok && best_tok->api->get_language_hint) {
+ *detected_lang_hint = best_tok->api->get_language_hint();
+ }
+
+ return best_tok;
+}
+
+/* Helper function to tokenize with a custom tokenizer handling exceptions */
+GArray *
+rspamd_custom_tokenizer_tokenize_with_exceptions(
+ struct rspamd_custom_tokenizer *tokenizer,
+ const char *text,
+ gsize len,
+ GList *exceptions,
+ rspamd_mempool_t *pool)
+{
+ GArray *words;
+ struct rspamd_tokenizer_result result;
+ struct rspamd_process_exception *ex;
+ GList *cur_ex = exceptions;
+ gsize pos = 0;
+ unsigned int i;
+ int ret;
+
+ words = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_stat_token_t), 128);
+
+ /* If no exceptions, tokenize the whole text */
+ if (!exceptions) {
+ result.positions = NULL;
+ result.count = 0;
+
+ ret = tokenizer->api->tokenize(text, len, &result);
+ if (ret == 0 && result.positions) {
+ /* Convert positions to tokens */
+ for (i = 0; i < result.count; i++) {
+ rspamd_stat_token_t tok;
+ unsigned int start = result.positions[i * 2];
+ unsigned int length = result.positions[i * 2 + 1];
+
+ if (start + length <= len) {
+ memset(&tok, 0, sizeof(tok));
+ tok.original.begin = text + start;
+ tok.original.len = length;
+ tok.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF;
+ g_array_append_val(words, tok);
+ }
+ }
+
+ /* Use tokenizer's cleanup function */
+ if (tokenizer->api->cleanup_result) {
+ tokenizer->api->cleanup_result(&result);
+ }
+ }
+
+ return words;
+ }
+
+ /* Process text with exceptions */
+ while (pos < len && cur_ex) {
+ ex = (struct rspamd_process_exception *) cur_ex->data;
+
+ /* Tokenize text before exception */
+ if (ex->pos > pos) {
+ gsize segment_len = ex->pos - pos;
+ result.positions = NULL;
+ result.count = 0;
+
+ ret = tokenizer->api->tokenize(text + pos, segment_len, &result);
+ if (ret == 0 && result.positions) {
+ /* Convert positions to tokens, adjusting for segment offset */
+ for (i = 0; i < result.count; i++) {
+ rspamd_stat_token_t tok;
+ unsigned int start = result.positions[i * 2] + pos;
+ unsigned int length = result.positions[i * 2 + 1];
+
+ if (start + length <= ex->pos) {
+ memset(&tok, 0, sizeof(tok));
+ tok.original.begin = text + start;
+ tok.original.len = length;
+ tok.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF;
+ g_array_append_val(words, tok);
+ }
+ }
+
+ /* Use tokenizer's cleanup function */
+ if (tokenizer->api->cleanup_result) {
+ tokenizer->api->cleanup_result(&result);
+ }
+ }
+ }
+
+ /* Add exception as a special token */
+ rspamd_stat_token_t ex_tok;
+ memset(&ex_tok, 0, sizeof(ex_tok));
+
+ if (ex->type == RSPAMD_EXCEPTION_URL) {
+ ex_tok.original.begin = "!!EX!!";
+ ex_tok.original.len = 6;
+ }
+ else {
+ ex_tok.original.begin = text + ex->pos;
+ ex_tok.original.len = ex->len;
+ }
+ ex_tok.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+ g_array_append_val(words, ex_tok);
+
+ /* Move past exception */
+ pos = ex->pos + ex->len;
+ cur_ex = g_list_next(cur_ex);
+ }
+
+ /* Process remaining text after last exception */
+ if (pos < len) {
+ result.positions = NULL;
+ result.count = 0;
+
+ ret = tokenizer->api->tokenize(text + pos, len - pos, &result);
+ if (ret == 0 && result.positions) {
+ /* Convert positions to tokens, adjusting for segment offset */
+ for (i = 0; i < result.count; i++) {
+ rspamd_stat_token_t tok;
+ unsigned int start = result.positions[i * 2] + pos;
+ unsigned int length = result.positions[i * 2 + 1];
+
+ if (start + length <= len) {
+ memset(&tok, 0, sizeof(tok));
+ tok.original.begin = text + start;
+ tok.original.len = length;
+ tok.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF;
+ g_array_append_val(words, tok);
+ }
+ }
+
+ /* Use tokenizer's cleanup function */
+ if (tokenizer->api->cleanup_result) {
+ tokenizer->api->cleanup_result(&result);
+ }
+ }
+ }
+
+ return words;
+}
/*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
#include "contrib/mumhash/mum.h"
#include "libmime/lang_detection.h"
#include "libstemmer.h"
+#define RSPAMD_TOKENIZER_INTERNAL
+#include "custom_tokenizer.h"
#include <unicode/utf8.h>
#include <unicode/uchar.h>
static const gsize long_text_limit = 1 * 1024 * 1024;
static const ev_tstamp max_exec_time = 0.2; /* 200 ms */
ev_tstamp start;
+ struct rspamd_custom_tokenizer *custom_tok = NULL;
+ double custom_confidence = 0.0;
+ const char *detected_lang = NULL;
if (text == NULL) {
return cur_words;
res = cur_words;
}
+ /* Try custom tokenizers first if we're in UTF mode */
+ if (cfg && cfg->tokenizer_manager && how == RSPAMD_TOKENIZE_UTF && utxt != NULL) {
+ custom_tok = rspamd_tokenizer_manager_detect(
+ cfg->tokenizer_manager,
+ text, len,
+ &custom_confidence,
+ NULL, /* no input language hint */
+ &detected_lang);
+
+ if (custom_tok && custom_confidence >= custom_tok->min_confidence) {
+ /* Use custom tokenizer with exception handling */
+ GArray *custom_res = rspamd_custom_tokenizer_tokenize_with_exceptions(
+ custom_tok, text, len, exceptions, pool);
+
+ if (custom_res) {
+ msg_debug_pool("using custom tokenizer %s (confidence: %.2f) for text tokenization",
+ custom_tok->name, custom_confidence);
+
+ /* Calculate hash if needed */
+ if (hash && custom_res->len > 0) {
+ unsigned int i;
+ for (i = 0; i < custom_res->len; i++) {
+ rspamd_stat_token_t *t = &g_array_index(custom_res, rspamd_stat_token_t, i);
+ if (t->original.len >= sizeof(uint64_t)) {
+ uint64_t tmp;
+ memcpy(&tmp, t->original.begin, sizeof(tmp));
+ hv = mum_hash_step(hv, tmp);
+ }
+ }
+ *hash = mum_hash_finish(hv);
+ }
+
+ /* If we had existing words, append to them */
+ if (cur_words && custom_res != cur_words) {
+ g_array_append_vals(cur_words, custom_res->data, custom_res->len);
+ g_array_free(custom_res, TRUE);
+ return cur_words;
+ }
+
+ return custom_res;
+ }
+ else {
+ msg_warn_pool("custom tokenizer %s failed to tokenize text, falling back to default",
+ custom_tok->name);
+ }
+ }
+ }
+
if (G_UNLIKELY(how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) {
while (rspamd_tokenizer_get_word_raw(&buf, &pos, &token, &cur, &l, FALSE)) {
if (l == 0 || (min_len > 0 && l < min_len) ||
}
}
}
-}
\ No newline at end of file
+}