From: Vsevolod Stakhov Date: Fri, 20 Jun 2025 15:50:49 +0000 (+0100) Subject: [Project] Create an isolated API for external tokenizers X-Git-Tag: 3.13.0~56^2~3 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5412d3fc0a0c064e5802dbafcbb1fbf9881fbbb8;p=thirdparty%2Frspamd.git [Project] Create an isolated API for external tokenizers --- diff --git a/src/libmime/message.c b/src/libmime/message.c index bac67fb079..8442c80ac8 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -40,6 +40,7 @@ #include "contrib/uthash/utlist.h" #include "contrib/t1ha/t1ha.h" #include "received.h" +#define RSPAMD_TOKENIZER_INTERNAL #include "libstat/tokenizers/custom_tokenizer.h" #define GTUBE_SYMBOL "GTUBE" diff --git a/src/libstat/tokenizers/custom_tokenizer.h b/src/libstat/tokenizers/custom_tokenizer.h index addf089200..bc173a1da4 100644 --- a/src/libstat/tokenizers/custom_tokenizer.h +++ b/src/libstat/tokenizers/custom_tokenizer.h @@ -17,9 +17,18 @@ #ifndef RSPAMD_CUSTOM_TOKENIZER_H #define RSPAMD_CUSTOM_TOKENIZER_H +/* Check if we're being included by internal Rspamd code or external plugins */ +#ifdef RSPAMD_TOKENIZER_INTERNAL +/* Internal Rspamd usage - use the full headers */ #include "config.h" #include "ucl.h" #include "libserver/word.h" +#else +/* External plugin usage - use standalone types */ +#include "rspamd_tokenizer_types.h" +/* Forward declaration for UCL object - plugins should include ucl.h if needed */ +typedef struct ucl_object_s ucl_object_t; +#endif #ifdef __cplusplus extern "C" { @@ -28,8 +37,7 @@ extern "C" { #define RSPAMD_CUSTOM_TOKENIZER_API_VERSION 1 /** - * Tokenization result - kvec of rspamd_word_t - * Uses kvec to avoid exposing GLIB structures to external API + * Tokenization result - compatible with both internal and external usage */ typedef rspamd_words_t rspamd_tokenizer_result_t; diff --git a/src/libstat/tokenizers/rspamd_tokenizer_types.h b/src/libstat/tokenizers/rspamd_tokenizer_types.h new file mode 100644 
index 0000000000..eb85182909 --- /dev/null +++ b/src/libstat/tokenizers/rspamd_tokenizer_types.h @@ -0,0 +1,89 @@ +/* + * Copyright 2025 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_TOKENIZER_TYPES_H +#define RSPAMD_TOKENIZER_TYPES_H + +/* + * Standalone type definitions for custom tokenizers + * This header is completely self-contained and does not depend on any external libraries. + * Custom tokenizers should include only this header to get access to all necessary types. 
+ */ + +#include <stddef.h> +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Basic string token structure + */ +typedef struct rspamd_ftok { + size_t len; + const char *begin; +} rspamd_ftok_t; + +/** + * Unicode string token structure + */ +typedef struct rspamd_ftok_unicode { + size_t len; + const uint32_t *begin; +} rspamd_ftok_unicode_t; + +/* Word flags */ +#define RSPAMD_WORD_FLAG_TEXT (1u << 0u) +#define RSPAMD_WORD_FLAG_META (1u << 1u) +#define RSPAMD_WORD_FLAG_LUA_META (1u << 2u) +#define RSPAMD_WORD_FLAG_EXCEPTION (1u << 3u) +#define RSPAMD_WORD_FLAG_HEADER (1u << 4u) +#define RSPAMD_WORD_FLAG_UNIGRAM (1u << 5u) +#define RSPAMD_WORD_FLAG_UTF (1u << 6u) +#define RSPAMD_WORD_FLAG_NORMALISED (1u << 7u) +#define RSPAMD_WORD_FLAG_STEMMED (1u << 8u) +#define RSPAMD_WORD_FLAG_BROKEN_UNICODE (1u << 9u) +#define RSPAMD_WORD_FLAG_STOP_WORD (1u << 10u) +#define RSPAMD_WORD_FLAG_SKIPPED (1u << 11u) +#define RSPAMD_WORD_FLAG_INVISIBLE_SPACES (1u << 12u) +#define RSPAMD_WORD_FLAG_EMOJI (1u << 13u) + +/** + * Word structure + */ +typedef struct rspamd_word { + rspamd_ftok_t original; + rspamd_ftok_unicode_t unicode; + rspamd_ftok_t normalized; + rspamd_ftok_t stemmed; + unsigned int flags; +} rspamd_word_t; + +/** + * Array of words + */ +typedef struct rspamd_words { + rspamd_word_t *a; + size_t n; + size_t m; +} rspamd_words_t; + +#ifdef __cplusplus +} +#endif + +#endif /* RSPAMD_TOKENIZER_TYPES_H */